Based on dataset information from kaggle, there are two columns containing a categorical type of data ("WINDOW" and "AGE_PERCENTIL"). The rest of them (229) are decimal type of data.
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
import dtale
import sweetviz as sv
import pandas_profiling as pdp
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import missingno as msno
import seaborn as sns
%matplotlib inline
pd.options.display.max_columns = None
pd.options.display.max_rows = None
raw_data = pd.read_csv('covid_data.csv', header = 0)
raw_data.head()
| PATIENT_VISIT_IDENTIFIER | AGE_ABOVE65 | AGE_PERCENTIL | GENDER | DISEASE GROUPING 1 | DISEASE GROUPING 2 | DISEASE GROUPING 3 | DISEASE GROUPING 4 | DISEASE GROUPING 5 | DISEASE GROUPING 6 | HTN | IMMUNOCOMPROMISED | OTHER | ALBUMIN_MEDIAN | ALBUMIN_MEAN | ALBUMIN_MIN | ALBUMIN_MAX | ALBUMIN_DIFF | BE_ARTERIAL_MEDIAN | BE_ARTERIAL_MEAN | BE_ARTERIAL_MIN | BE_ARTERIAL_MAX | BE_ARTERIAL_DIFF | BE_VENOUS_MEDIAN | BE_VENOUS_MEAN | BE_VENOUS_MIN | BE_VENOUS_MAX | BE_VENOUS_DIFF | BIC_ARTERIAL_MEDIAN | BIC_ARTERIAL_MEAN | BIC_ARTERIAL_MIN | BIC_ARTERIAL_MAX | BIC_ARTERIAL_DIFF | BIC_VENOUS_MEDIAN | BIC_VENOUS_MEAN | BIC_VENOUS_MIN | BIC_VENOUS_MAX | BIC_VENOUS_DIFF | BILLIRUBIN_MEDIAN | BILLIRUBIN_MEAN | BILLIRUBIN_MIN | BILLIRUBIN_MAX | BILLIRUBIN_DIFF | BLAST_MEDIAN | BLAST_MEAN | BLAST_MIN | BLAST_MAX | BLAST_DIFF | CALCIUM_MEDIAN | CALCIUM_MEAN | CALCIUM_MIN | CALCIUM_MAX | CALCIUM_DIFF | CREATININ_MEDIAN | CREATININ_MEAN | CREATININ_MIN | CREATININ_MAX | CREATININ_DIFF | FFA_MEDIAN | FFA_MEAN | FFA_MIN | FFA_MAX | FFA_DIFF | GGT_MEDIAN | GGT_MEAN | GGT_MIN | GGT_MAX | GGT_DIFF | GLUCOSE_MEDIAN | GLUCOSE_MEAN | GLUCOSE_MIN | GLUCOSE_MAX | GLUCOSE_DIFF | HEMATOCRITE_MEDIAN | HEMATOCRITE_MEAN | HEMATOCRITE_MIN | HEMATOCRITE_MAX | HEMATOCRITE_DIFF | HEMOGLOBIN_MEDIAN | HEMOGLOBIN_MEAN | HEMOGLOBIN_MIN | HEMOGLOBIN_MAX | HEMOGLOBIN_DIFF | INR_MEDIAN | INR_MEAN | INR_MIN | INR_MAX | INR_DIFF | LACTATE_MEDIAN | LACTATE_MEAN | LACTATE_MIN | LACTATE_MAX | LACTATE_DIFF | LEUKOCYTES_MEDIAN | LEUKOCYTES_MEAN | LEUKOCYTES_MIN | LEUKOCYTES_MAX | LEUKOCYTES_DIFF | LINFOCITOS_MEDIAN | LINFOCITOS_MEAN | LINFOCITOS_MIN | LINFOCITOS_MAX | LINFOCITOS_DIFF | NEUTROPHILES_MEDIAN | NEUTROPHILES_MEAN | NEUTROPHILES_MIN | NEUTROPHILES_MAX | NEUTROPHILES_DIFF | P02_ARTERIAL_MEDIAN | P02_ARTERIAL_MEAN | P02_ARTERIAL_MIN | P02_ARTERIAL_MAX | P02_ARTERIAL_DIFF | P02_VENOUS_MEDIAN | P02_VENOUS_MEAN | P02_VENOUS_MIN | P02_VENOUS_MAX | P02_VENOUS_DIFF | PC02_ARTERIAL_MEDIAN | PC02_ARTERIAL_MEAN | PC02_ARTERIAL_MIN | PC02_ARTERIAL_MAX | PC02_ARTERIAL_DIFF | PC02_VENOUS_MEDIAN | PC02_VENOUS_MEAN | PC02_VENOUS_MIN | PC02_VENOUS_MAX | PC02_VENOUS_DIFF | PCR_MEDIAN | PCR_MEAN | PCR_MIN | PCR_MAX | PCR_DIFF | PH_ARTERIAL_MEDIAN | PH_ARTERIAL_MEAN | PH_ARTERIAL_MIN | PH_ARTERIAL_MAX | PH_ARTERIAL_DIFF | PH_VENOUS_MEDIAN | PH_VENOUS_MEAN | PH_VENOUS_MIN | PH_VENOUS_MAX | PH_VENOUS_DIFF | PLATELETS_MEDIAN | PLATELETS_MEAN | PLATELETS_MIN | PLATELETS_MAX | PLATELETS_DIFF | POTASSIUM_MEDIAN | POTASSIUM_MEAN | POTASSIUM_MIN | POTASSIUM_MAX | POTASSIUM_DIFF | SAT02_ARTERIAL_MEDIAN | SAT02_ARTERIAL_MEAN | SAT02_ARTERIAL_MIN | SAT02_ARTERIAL_MAX | SAT02_ARTERIAL_DIFF | SAT02_VENOUS_MEDIAN | SAT02_VENOUS_MEAN | SAT02_VENOUS_MIN | SAT02_VENOUS_MAX | SAT02_VENOUS_DIFF | SODIUM_MEDIAN | SODIUM_MEAN | SODIUM_MIN | SODIUM_MAX | SODIUM_DIFF | TGO_MEDIAN | TGO_MEAN | TGO_MIN | TGO_MAX | TGO_DIFF | TGP_MEDIAN | TGP_MEAN | TGP_MIN | TGP_MAX | TGP_DIFF | TTPA_MEDIAN | TTPA_MEAN | TTPA_MIN | TTPA_MAX | TTPA_DIFF | UREA_MEDIAN | UREA_MEAN | UREA_MIN | UREA_MAX | UREA_DIFF | DIMER_MEDIAN | DIMER_MEAN | DIMER_MIN | DIMER_MAX | DIMER_DIFF | BLOODPRESSURE_DIASTOLIC_MEAN | BLOODPRESSURE_SISTOLIC_MEAN | HEART_RATE_MEAN | RESPIRATORY_RATE_MEAN | TEMPERATURE_MEAN | OXYGEN_SATURATION_MEAN | BLOODPRESSURE_DIASTOLIC_MEDIAN | BLOODPRESSURE_SISTOLIC_MEDIAN | HEART_RATE_MEDIAN | RESPIRATORY_RATE_MEDIAN | TEMPERATURE_MEDIAN | OXYGEN_SATURATION_MEDIAN | BLOODPRESSURE_DIASTOLIC_MIN | BLOODPRESSURE_SISTOLIC_MIN | HEART_RATE_MIN | RESPIRATORY_RATE_MIN | TEMPERATURE_MIN | OXYGEN_SATURATION_MIN | BLOODPRESSURE_DIASTOLIC_MAX | BLOODPRESSURE_SISTOLIC_MAX | HEART_RATE_MAX | RESPIRATORY_RATE_MAX | TEMPERATURE_MAX | OXYGEN_SATURATION_MAX | BLOODPRESSURE_DIASTOLIC_DIFF | BLOODPRESSURE_SISTOLIC_DIFF | HEART_RATE_DIFF | RESPIRATORY_RATE_DIFF | TEMPERATURE_DIFF | OXYGEN_SATURATION_DIFF | BLOODPRESSURE_DIASTOLIC_DIFF_REL | BLOODPRESSURE_SISTOLIC_DIFF_REL | HEART_RATE_DIFF_REL | RESPIRATORY_RATE_DIFF_REL | TEMPERATURE_DIFF_REL | OXYGEN_SATURATION_DIFF_REL | WINDOW | ICU | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 60th | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.086420 | -0.230769 | -0.283019 | -0.593220 | -0.285714 | 0.736842 | 0.086420 | -0.230769 | -0.283019 | -0.586207 | -0.285714 | 0.736842 | 0.237113 | 0.0000 | -0.162393 | -0.500000 | 0.208791 | 0.898990 | -0.247863 | -0.459459 | -0.432836 | -0.636364 | -0.420290 | 0.736842 | -1.00000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 0-2 | 0 |
| 1 | 0 | 1 | 60th | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.333333 | -0.230769 | -0.132075 | -0.593220 | 0.535714 | 0.578947 | 0.333333 | -0.230769 | -0.132075 | -0.586207 | 0.535714 | 0.578947 | 0.443299 | 0.0000 | -0.025641 | -0.500000 | 0.714286 | 0.838384 | -0.076923 | -0.459459 | -0.313433 | -0.636364 | 0.246377 | 0.578947 | -1.00000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 2-4 | 0 |
| 2 | 0 | 1 | 60th | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.605263 | 0.605263 | 0.605263 | 0.605263 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.938950 | -0.938950 | -0.938950 | -0.938950 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.183673 | 0.183673 | 0.183673 | 0.183673 | -1.0 | -0.868365 | -0.868365 | -0.868365 | -0.868365 | -1.0 | -0.742004 | -0.742004 | -0.742004 | -0.742004 | -1.0 | -0.945093 | -0.945093 | -0.945093 | -0.945093 | -1.0 | -0.891993 | -0.891993 | -0.891993 | -0.891993 | -1.0 | 0.090147 | 0.090147 | 0.090147 | 0.090147 | -1.0 | 0.109756 | 0.109756 | 0.109756 | 0.109756 | -1.0 | -0.932246 | -0.932246 | -0.932246 | -0.932246 | -1.0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | -1.0 | -0.835844 | -0.835844 | -0.835844 | -0.835844 | -1.0 | -0.914938 | -0.914938 | -0.914938 | -0.914938 | -1.0 | -0.868747 | -0.868747 | -0.868747 | -0.868747 | -1.0 | -0.170732 | -0.170732 | -0.170732 | -0.170732 | -1.0 | -0.704142 | -0.704142 | -0.704142 | -0.704142 | -1.0 | -0.779310 | -0.779310 | -0.779310 | -0.779310 | -1.0 | -0.754601 | -0.754601 | -0.754601 | -0.754601 | -1.0 | -0.875236 | -0.875236 | -0.875236 | -0.875236 | -1.0 | 0.234043 | 0.234043 | 0.234043 | 0.234043 | -1.0 | 0.363636 | 0.363636 | 0.363636 | 0.363636 | -1.0 | -0.540721 | -0.540721 | -0.540721 | -0.540721 | -1.0 | -0.518519 | -0.518519 | -0.518519 | -0.518519 | -1.0 | 0.939394 | 0.939394 | 0.939394 | 0.939394 | -1.0 | 0.345679 | 0.345679 | 0.345679 | 0.345679 | -1.0 | -0.028571 | -0.028571 | -0.028571 | -0.028571 | -1.0 | -0.997201 | -0.997201 | -0.997201 | -0.997201 | -1.0 | -0.990854 | -0.990854 | -0.990854 | -0.990854 | -1.0 | -0.825613 | -0.825613 | -0.825613 | -0.825613 | -1.0 | -0.836145 | -0.836145 | -0.836145 | -0.836145 | -1.0 | -0.994912 | -0.994912 | -0.994912 | -0.994912 | -1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4-6 | 0 |
| 3 | 0 | 1 | 60th | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -0.107143 | 0.736842 | NaN | NaN | NaN | NaN | -0.107143 | 0.736842 | NaN | NaN | NaN | NaN | 0.318681 | 0.898990 | NaN | NaN | NaN | NaN | -0.275362 | 0.736842 | NaN | NaN | NaN | NaN | -1.000000 | -1.000000 | NaN | NaN | NaN | NaN | -1.000000 | -1.000000 | 6-12 | 0 |
| 4 | 0 | 1 | 60th | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1.0 | -0.871658 | -0.871658 | -0.871658 | -0.871658 | -1.0 | -0.863874 | -0.863874 | -0.863874 | -0.863874 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.414634 | -0.414634 | -0.414634 | -0.414634 | -1.0 | -0.979069 | -0.979069 | -0.979069 | -0.979069 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.326531 | 0.326531 | 0.326531 | 0.326531 | -1.0 | -0.926398 | -0.926398 | -0.926398 | -0.926398 | -1.0 | -0.859275 | -0.859275 | -0.859275 | -0.859275 | -1.0 | -0.669393 | -0.669393 | -0.669393 | -0.669393 | -1.0 | -0.891993 | -0.891993 | -0.891993 | -0.891993 | -1.0 | -0.320755 | -0.320755 | -0.320755 | -0.320755 | -1.0 | -0.353659 | -0.353659 | -0.353659 | -0.353659 | -1.0 | -0.979925 | -0.979925 | -0.979925 | -0.979925 | -1.0 | -0.963023 | -0.963023 | -0.963023 | -0.963023 | -1.0 | -0.762843 | -0.762843 | -0.762843 | -0.762843 | -1.0 | -0.643154 | -0.643154 | -0.643154 | -0.643154 | -1.0 | -0.868747 | -0.868747 | -0.868747 | -0.868747 | -1.0 | -0.365854 | -0.365854 | -0.365854 | -0.365854 | -1.0 | -0.230769 | -0.230769 | -0.230769 | -0.230769 | -1.0 | -0.875862 | -0.875862 | -0.875862 | -0.875862 | -1.0 | -0.815951 | -0.815951 | -0.815951 | -0.815951 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | 0.574468 | 0.574468 | 0.574468 | 0.574468 | -1.0 | 0.393939 | 0.393939 | 0.393939 | 0.393939 | -1.0 | -0.471295 | -0.471295 | -0.471295 | -0.471295 | -1.0 | -0.666667 | -0.666667 | -0.666667 | -0.666667 | -1.0 | 0.848485 | 0.848485 | 0.848485 | 0.848485 | -1.0 | 0.925926 | 0.925926 | 0.925926 | 0.925926 | -1.0 | 0.142857 | 0.142857 | 0.142857 | 0.142857 | -1.0 | -0.999067 | -0.999067 | -0.999067 | -0.999067 | -1.0 | -0.983994 | -0.983994 | -0.983994 | -0.983994 | -1.0 | -0.846633 | -0.846633 | -0.846633 | -0.846633 | -1.0 | -0.836145 | -0.836145 | -0.836145 | -0.836145 | -1.0 | -0.996762 | -0.996762 | -0.996762 | -0.996762 | -1.0 | -0.243021 | -0.338537 | -0.213031 | -0.317859 | 0.033779 | 0.665932 | -0.283951 | -0.376923 | -0.188679 | -0.379310 | 0.035714 | 0.631579 | -0.340206 | -0.4875 | -0.572650 | -0.857143 | 0.098901 | 0.797980 | -0.076923 | 0.286486 | 0.298507 | 0.272727 | 0.362319 | 0.947368 | -0.33913 | 0.325153 | 0.114504 | 0.176471 | -0.238095 | -0.818182 | -0.389967 | 0.407558 | -0.230462 | 0.096774 | -0.242282 | -0.814433 | ABOVE_12 | 1 |
raw_data.tail()
| PATIENT_VISIT_IDENTIFIER | AGE_ABOVE65 | AGE_PERCENTIL | GENDER | DISEASE GROUPING 1 | DISEASE GROUPING 2 | DISEASE GROUPING 3 | DISEASE GROUPING 4 | DISEASE GROUPING 5 | DISEASE GROUPING 6 | HTN | IMMUNOCOMPROMISED | OTHER | ALBUMIN_MEDIAN | ALBUMIN_MEAN | ALBUMIN_MIN | ALBUMIN_MAX | ALBUMIN_DIFF | BE_ARTERIAL_MEDIAN | BE_ARTERIAL_MEAN | BE_ARTERIAL_MIN | BE_ARTERIAL_MAX | BE_ARTERIAL_DIFF | BE_VENOUS_MEDIAN | BE_VENOUS_MEAN | BE_VENOUS_MIN | BE_VENOUS_MAX | BE_VENOUS_DIFF | BIC_ARTERIAL_MEDIAN | BIC_ARTERIAL_MEAN | BIC_ARTERIAL_MIN | BIC_ARTERIAL_MAX | BIC_ARTERIAL_DIFF | BIC_VENOUS_MEDIAN | BIC_VENOUS_MEAN | BIC_VENOUS_MIN | BIC_VENOUS_MAX | BIC_VENOUS_DIFF | BILLIRUBIN_MEDIAN | BILLIRUBIN_MEAN | BILLIRUBIN_MIN | BILLIRUBIN_MAX | BILLIRUBIN_DIFF | BLAST_MEDIAN | BLAST_MEAN | BLAST_MIN | BLAST_MAX | BLAST_DIFF | CALCIUM_MEDIAN | CALCIUM_MEAN | CALCIUM_MIN | CALCIUM_MAX | CALCIUM_DIFF | CREATININ_MEDIAN | CREATININ_MEAN | CREATININ_MIN | CREATININ_MAX | CREATININ_DIFF | FFA_MEDIAN | FFA_MEAN | FFA_MIN | FFA_MAX | FFA_DIFF | GGT_MEDIAN | GGT_MEAN | GGT_MIN | GGT_MAX | GGT_DIFF | GLUCOSE_MEDIAN | GLUCOSE_MEAN | GLUCOSE_MIN | GLUCOSE_MAX | GLUCOSE_DIFF | HEMATOCRITE_MEDIAN | HEMATOCRITE_MEAN | HEMATOCRITE_MIN | HEMATOCRITE_MAX | HEMATOCRITE_DIFF | HEMOGLOBIN_MEDIAN | HEMOGLOBIN_MEAN | HEMOGLOBIN_MIN | HEMOGLOBIN_MAX | HEMOGLOBIN_DIFF | INR_MEDIAN | INR_MEAN | INR_MIN | INR_MAX | INR_DIFF | LACTATE_MEDIAN | LACTATE_MEAN | LACTATE_MIN | LACTATE_MAX | LACTATE_DIFF | LEUKOCYTES_MEDIAN | LEUKOCYTES_MEAN | LEUKOCYTES_MIN | LEUKOCYTES_MAX | LEUKOCYTES_DIFF | LINFOCITOS_MEDIAN | LINFOCITOS_MEAN | LINFOCITOS_MIN | LINFOCITOS_MAX | LINFOCITOS_DIFF | NEUTROPHILES_MEDIAN | NEUTROPHILES_MEAN | NEUTROPHILES_MIN | NEUTROPHILES_MAX | NEUTROPHILES_DIFF | P02_ARTERIAL_MEDIAN | P02_ARTERIAL_MEAN | P02_ARTERIAL_MIN | P02_ARTERIAL_MAX | P02_ARTERIAL_DIFF | P02_VENOUS_MEDIAN | P02_VENOUS_MEAN | P02_VENOUS_MIN | P02_VENOUS_MAX | P02_VENOUS_DIFF | PC02_ARTERIAL_MEDIAN | PC02_ARTERIAL_MEAN | PC02_ARTERIAL_MIN | PC02_ARTERIAL_MAX | PC02_ARTERIAL_DIFF | PC02_VENOUS_MEDIAN | PC02_VENOUS_MEAN | PC02_VENOUS_MIN | PC02_VENOUS_MAX | PC02_VENOUS_DIFF | PCR_MEDIAN | PCR_MEAN | PCR_MIN | PCR_MAX | PCR_DIFF | PH_ARTERIAL_MEDIAN | PH_ARTERIAL_MEAN | PH_ARTERIAL_MIN | PH_ARTERIAL_MAX | PH_ARTERIAL_DIFF | PH_VENOUS_MEDIAN | PH_VENOUS_MEAN | PH_VENOUS_MIN | PH_VENOUS_MAX | PH_VENOUS_DIFF | PLATELETS_MEDIAN | PLATELETS_MEAN | PLATELETS_MIN | PLATELETS_MAX | PLATELETS_DIFF | POTASSIUM_MEDIAN | POTASSIUM_MEAN | POTASSIUM_MIN | POTASSIUM_MAX | POTASSIUM_DIFF | SAT02_ARTERIAL_MEDIAN | SAT02_ARTERIAL_MEAN | SAT02_ARTERIAL_MIN | SAT02_ARTERIAL_MAX | SAT02_ARTERIAL_DIFF | SAT02_VENOUS_MEDIAN | SAT02_VENOUS_MEAN | SAT02_VENOUS_MIN | SAT02_VENOUS_MAX | SAT02_VENOUS_DIFF | SODIUM_MEDIAN | SODIUM_MEAN | SODIUM_MIN | SODIUM_MAX | SODIUM_DIFF | TGO_MEDIAN | TGO_MEAN | TGO_MIN | TGO_MAX | TGO_DIFF | TGP_MEDIAN | TGP_MEAN | TGP_MIN | TGP_MAX | TGP_DIFF | TTPA_MEDIAN | TTPA_MEAN | TTPA_MIN | TTPA_MAX | TTPA_DIFF | UREA_MEDIAN | UREA_MEAN | UREA_MIN | UREA_MAX | UREA_DIFF | DIMER_MEDIAN | DIMER_MEAN | DIMER_MIN | DIMER_MAX | DIMER_DIFF | BLOODPRESSURE_DIASTOLIC_MEAN | BLOODPRESSURE_SISTOLIC_MEAN | HEART_RATE_MEAN | RESPIRATORY_RATE_MEAN | TEMPERATURE_MEAN | OXYGEN_SATURATION_MEAN | BLOODPRESSURE_DIASTOLIC_MEDIAN | BLOODPRESSURE_SISTOLIC_MEDIAN | HEART_RATE_MEDIAN | RESPIRATORY_RATE_MEDIAN | TEMPERATURE_MEDIAN | OXYGEN_SATURATION_MEDIAN | BLOODPRESSURE_DIASTOLIC_MIN | BLOODPRESSURE_SISTOLIC_MIN | HEART_RATE_MIN | RESPIRATORY_RATE_MIN | TEMPERATURE_MIN | OXYGEN_SATURATION_MIN | BLOODPRESSURE_DIASTOLIC_MAX | BLOODPRESSURE_SISTOLIC_MAX | HEART_RATE_MAX | RESPIRATORY_RATE_MAX | TEMPERATURE_MAX | OXYGEN_SATURATION_MAX | BLOODPRESSURE_DIASTOLIC_DIFF | BLOODPRESSURE_SISTOLIC_DIFF | HEART_RATE_DIFF | RESPIRATORY_RATE_DIFF | TEMPERATURE_DIFF | OXYGEN_SATURATION_DIFF | BLOODPRESSURE_DIASTOLIC_DIFF_REL | BLOODPRESSURE_SISTOLIC_DIFF_REL | HEART_RATE_DIFF_REL | RESPIRATORY_RATE_DIFF_REL | TEMPERATURE_DIFF_REL | OXYGEN_SATURATION_DIFF_REL | WINDOW | ICU | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1920 | 384 | 0 | 50th | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.012346 | -0.292308 | 0.056604 | -0.525424 | 0.535714 | 0.789474 | 0.012346 | -0.292308 | 0.056604 | -0.517241 | 0.535714 | 0.789474 | 0.175258 | -0.050 | 0.145299 | -0.428571 | 0.714286 | 0.919192 | -0.299145 | -0.502703 | -0.164179 | -0.575758 | 0.246377 | 0.789474 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 0-2 | 0 |
| 1921 | 384 | 0 | 50th | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.605263 | 0.605263 | 0.605263 | 0.605263 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -0.717277 | -0.717277 | -0.717277 | -0.717277 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.170732 | -0.170732 | -0.170732 | -0.170732 | -1.0 | -0.982208 | -0.982208 | -0.982208 | -0.982208 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.244898 | 0.244898 | 0.244898 | 0.244898 | -1.0 | -0.934890 | -0.934890 | -0.934890 | -0.934890 | -1.0 | -0.782516 | -0.782516 | -0.782516 | -0.782516 | -1.0 | -0.960280 | -0.960280 | -0.960280 | -0.960280 | -1.0 | -0.862197 | -0.862197 | -0.862197 | -0.862197 | -1.0 | -0.064990 | -0.064990 | -0.064990 | -0.064990 | -1.0 | -0.158537 | -0.158537 | -0.158537 | -0.158537 | -1.0 | -0.957340 | -0.957340 | -0.957340 | -0.957340 | -1.0 | -0.897773 | -0.897773 | -0.897773 | -0.897773 | -1.0 | -0.848590 | -0.848590 | -0.848590 | -0.848590 | -1.0 | -0.686722 | -0.686722 | -0.686722 | -0.686722 | -1.0 | -0.913165 | -0.913165 | -0.913165 | -0.913165 | -1.0 | -0.170732 | -0.170732 | -0.170732 | -0.170732 | -1.0 | -0.857988 | -0.857988 | -0.857988 | -0.857988 | -1.0 | -0.77931 | -0.77931 | -0.77931 | -0.77931 | -1.0 | -0.730061 | -0.730061 | -0.730061 | -0.730061 | -1.0 | -0.906238 | -0.906238 | -0.906238 | -0.906238 | -1.0 | 0.234043 | 0.234043 | 0.234043 | 0.234043 | -1.0 | 0.424242 | 0.424242 | 0.424242 | 0.424242 | -1.0 | -0.479306 | -0.479306 | -0.479306 | -0.479306 | -1.0 | -0.333333 | -0.333333 | -0.333333 | -0.333333 | -1.0 | 0.939394 | 0.939394 | 0.939394 | 0.939394 | -1.0 | -0.333333 | -0.333333 | -0.333333 | -0.333333 | -1.0 | -0.085714 | -0.085714 | -0.085714 | -0.085714 | -1.0 | -0.997387 | -0.997387 | -0.997387 | -0.997387 | -1.0 | -0.992378 | -0.992378 | -0.992378 | -0.992378 | -1.0 | -0.869210 | -0.869210 | -0.869210 | -0.869210 | -1.0 | -0.879518 | -0.879518 | -0.879518 | -0.879518 | -1.0 | -0.979571 | -0.979571 | -0.979571 | -0.979571 | -1.0 | 0.086420 | -0.384615 | -0.113208 | -0.593220 | 0.142857 | 0.578947 | 0.086420 | -0.384615 | -0.113208 | -0.586207 | 0.142857 | 0.578947 | 0.237113 | -0.125 | -0.008547 | -0.500000 | 0.472527 | 0.838384 | -0.247863 | -0.567568 | -0.298507 | -0.636364 | -0.072464 | 0.578947 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 2-4 | 0 |
| 1922 | 384 | 0 | 50th | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.086420 | -0.230769 | -0.169811 | -0.593220 | 0.142857 | 0.736842 | 0.086420 | -0.230769 | -0.169811 | -0.586207 | 0.142857 | 0.736842 | 0.237113 | 0.000 | -0.059829 | -0.500000 | 0.472527 | 0.898990 | -0.247863 | -0.459459 | -0.343284 | -0.636364 | -0.072464 | 0.736842 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 4-6 | 0 |
| 1923 | 384 | 0 | 50th | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.209877 | -0.384615 | -0.188679 | -0.661017 | 0.285714 | 0.473684 | 0.209877 | -0.384615 | -0.188679 | -0.655172 | 0.285714 | 0.473684 | 0.340206 | -0.125 | -0.076923 | -0.571429 | 0.560440 | 0.797980 | -0.162393 | -0.567568 | -0.358209 | -0.696970 | 0.043478 | 0.473684 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 6-12 | 0 |
| 1924 | 384 | 0 | 50th | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.605263 | 0.605263 | 0.605263 | 0.605263 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.983255 | -0.983255 | -0.983255 | -0.983255 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.306122 | 0.306122 | 0.306122 | 0.306122 | -1.0 | -0.944798 | -0.944798 | -0.944798 | -0.944798 | -1.0 | -0.825160 | -0.825160 | -0.825160 | -0.825160 | -1.0 | -0.962617 | -0.962617 | -0.962617 | -0.962617 | -1.0 | -0.891993 | -0.891993 | -0.891993 | -0.891993 | -1.0 | -0.157233 | -0.157233 | -0.157233 | -0.157233 | -1.0 | -0.292683 | -0.292683 | -0.292683 | -0.292683 | -1.0 | -0.959849 | -0.959849 | -0.959849 | -0.959849 | -1.0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | -1.0 | -0.850521 | -0.850521 | -0.850521 | -0.850521 | -1.0 | -0.634855 | -0.634855 | -0.634855 | -0.634855 | -1.0 | -0.935974 | -0.935974 | -0.935974 | -0.935974 | -1.0 | -0.170732 | -0.170732 | -0.170732 | -0.170732 | -1.0 | -0.704142 | -0.704142 | -0.704142 | -0.704142 | -1.0 | -0.77931 | -0.77931 | -0.77931 | -0.77931 | -1.0 | -0.754601 | -0.754601 | -0.754601 | -0.754601 | -1.0 | -0.801134 | -0.801134 | -0.801134 | -0.801134 | -1.0 | 0.234043 | 0.234043 | 0.234043 | 0.234043 | -1.0 | 0.363636 | 0.363636 | 0.363636 | 0.363636 | -1.0 | -0.463284 | -0.463284 | -0.463284 | -0.463284 | -1.0 | -0.444444 | -0.444444 | -0.444444 | -0.444444 | -1.0 | 0.939394 | 0.939394 | 0.939394 | 0.939394 | -1.0 | 0.345679 | 0.345679 | 0.345679 | 0.345679 | -1.0 | 0.200000 | 0.200000 | 0.200000 | 0.200000 | -1.0 | -0.997761 | -0.997761 | -0.997761 | -0.997761 | -1.0 | -0.991997 | -0.991997 | -0.991997 | -0.991997 | -1.0 | -0.846633 | -0.846633 | -0.846633 | -0.846633 | -1.0 | -0.807229 | -0.807229 | -0.807229 | -0.807229 | -1.0 | -0.888448 | -0.888448 | -0.888448 | -0.888448 | -1.0 | -0.185185 | -0.539103 | -0.107704 | -0.610169 | 0.050595 | 0.662281 | -0.160494 | -0.538462 | -0.075472 | -0.586207 | 0.071429 | 0.631579 | -0.175258 | -0.375 | -0.247863 | -0.785714 | 0.186813 | 0.777778 | -0.247863 | -0.470270 | -0.149254 | -0.515152 | 0.101449 | 0.842105 | -0.652174 | -0.644172 | -0.633588 | -0.647059 | -0.547619 | -0.838384 | -0.701863 | -0.585967 | -0.763868 | -0.612903 | -0.551337 | -0.835052 | ABOVE_12 | 0 |
raw_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1925 entries, 0 to 1924 Columns: 231 entries, PATIENT_VISIT_IDENTIFIER to ICU dtypes: float64(225), int64(4), object(2) memory usage: 3.4+ MB
number_of_patients = raw_data['PATIENT_VISIT_IDENTIFIER'].max()+1
print(f"The total number of patients from the dataset is {number_of_patients}")
The total number of patients from the dataset is 385
msno.bar(raw_data)
<AxesSubplot:>
Observations:
print('NaN values =', raw_data.isnull().sum().sum())
print("""""")
vars_with_missing = []
for feature in raw_data.columns:
missings = raw_data[feature].isna().sum()
if missings > 0 :
vars_with_missing.append(feature)
missings_perc = missings / raw_data.shape[0]
print('Variable {} has {} records ({:.2%}) with missing values.'.format(feature, missings, missings_perc))
print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))
NaN values = 223863 Variable DISEASE GROUPING 1 has 5 records (0.26%) with missing values. Variable DISEASE GROUPING 2 has 5 records (0.26%) with missing values. Variable DISEASE GROUPING 3 has 5 records (0.26%) with missing values. Variable DISEASE GROUPING 4 has 5 records (0.26%) with missing values. Variable DISEASE GROUPING 5 has 5 records (0.26%) with missing values. Variable DISEASE GROUPING 6 has 5 records (0.26%) with missing values. Variable HTN has 5 records (0.26%) with missing values. Variable IMMUNOCOMPROMISED has 5 records (0.26%) with missing values. Variable OTHER has 5 records (0.26%) with missing values. Variable ALBUMIN_MEDIAN has 1104 records (57.35%) with missing values. Variable ALBUMIN_MEAN has 1104 records (57.35%) with missing values. Variable ALBUMIN_MIN has 1104 records (57.35%) with missing values. Variable ALBUMIN_MAX has 1104 records (57.35%) with missing values. Variable ALBUMIN_DIFF has 1104 records (57.35%) with missing values. Variable BE_ARTERIAL_MEDIAN has 1104 records (57.35%) with missing values. Variable BE_ARTERIAL_MEAN has 1104 records (57.35%) with missing values. Variable BE_ARTERIAL_MIN has 1104 records (57.35%) with missing values. Variable BE_ARTERIAL_MAX has 1104 records (57.35%) with missing values. Variable BE_ARTERIAL_DIFF has 1104 records (57.35%) with missing values. Variable BE_VENOUS_MEDIAN has 1104 records (57.35%) with missing values. Variable BE_VENOUS_MEAN has 1104 records (57.35%) with missing values. Variable BE_VENOUS_MIN has 1104 records (57.35%) with missing values. Variable BE_VENOUS_MAX has 1104 records (57.35%) with missing values. Variable BE_VENOUS_DIFF has 1104 records (57.35%) with missing values. Variable BIC_ARTERIAL_MEDIAN has 1104 records (57.35%) with missing values. Variable BIC_ARTERIAL_MEAN has 1104 records (57.35%) with missing values. Variable BIC_ARTERIAL_MIN has 1104 records (57.35%) with missing values. Variable BIC_ARTERIAL_MAX has 1104 records (57.35%) with missing values. Variable BIC_ARTERIAL_DIFF has 1104 records (57.35%) with missing values. Variable BIC_VENOUS_MEDIAN has 1104 records (57.35%) with missing values. Variable BIC_VENOUS_MEAN has 1104 records (57.35%) with missing values. Variable BIC_VENOUS_MIN has 1104 records (57.35%) with missing values. Variable BIC_VENOUS_MAX has 1104 records (57.35%) with missing values. Variable BIC_VENOUS_DIFF has 1104 records (57.35%) with missing values. Variable BILLIRUBIN_MEDIAN has 1104 records (57.35%) with missing values. Variable BILLIRUBIN_MEAN has 1104 records (57.35%) with missing values. Variable BILLIRUBIN_MIN has 1104 records (57.35%) with missing values. Variable BILLIRUBIN_MAX has 1104 records (57.35%) with missing values. Variable BILLIRUBIN_DIFF has 1104 records (57.35%) with missing values. Variable BLAST_MEDIAN has 1104 records (57.35%) with missing values. Variable BLAST_MEAN has 1104 records (57.35%) with missing values. Variable BLAST_MIN has 1104 records (57.35%) with missing values. Variable BLAST_MAX has 1104 records (57.35%) with missing values. Variable BLAST_DIFF has 1104 records (57.35%) with missing values. Variable CALCIUM_MEDIAN has 1104 records (57.35%) with missing values. Variable CALCIUM_MEAN has 1104 records (57.35%) with missing values. Variable CALCIUM_MIN has 1104 records (57.35%) with missing values. Variable CALCIUM_MAX has 1104 records (57.35%) with missing values. Variable CALCIUM_DIFF has 1104 records (57.35%) with missing values. Variable CREATININ_MEDIAN has 1104 records (57.35%) with missing values. Variable CREATININ_MEAN has 1104 records (57.35%) with missing values. Variable CREATININ_MIN has 1104 records (57.35%) with missing values. Variable CREATININ_MAX has 1104 records (57.35%) with missing values. Variable CREATININ_DIFF has 1104 records (57.35%) with missing values. Variable FFA_MEDIAN has 1104 records (57.35%) with missing values. Variable FFA_MEAN has 1104 records (57.35%) with missing values. Variable FFA_MIN has 1104 records (57.35%) with missing values. Variable FFA_MAX has 1104 records (57.35%) with missing values. Variable FFA_DIFF has 1104 records (57.35%) with missing values. Variable GGT_MEDIAN has 1104 records (57.35%) with missing values. Variable GGT_MEAN has 1104 records (57.35%) with missing values. Variable GGT_MIN has 1104 records (57.35%) with missing values. Variable GGT_MAX has 1104 records (57.35%) with missing values. Variable GGT_DIFF has 1104 records (57.35%) with missing values. Variable GLUCOSE_MEDIAN has 1104 records (57.35%) with missing values. Variable GLUCOSE_MEAN has 1104 records (57.35%) with missing values. Variable GLUCOSE_MIN has 1104 records (57.35%) with missing values. Variable GLUCOSE_MAX has 1104 records (57.35%) with missing values. Variable GLUCOSE_DIFF has 1104 records (57.35%) with missing values. Variable HEMATOCRITE_MEDIAN has 1104 records (57.35%) with missing values. Variable HEMATOCRITE_MEAN has 1104 records (57.35%) with missing values. Variable HEMATOCRITE_MIN has 1104 records (57.35%) with missing values. Variable HEMATOCRITE_MAX has 1104 records (57.35%) with missing values. Variable HEMATOCRITE_DIFF has 1104 records (57.35%) with missing values. Variable HEMOGLOBIN_MEDIAN has 1104 records (57.35%) with missing values. Variable HEMOGLOBIN_MEAN has 1104 records (57.35%) with missing values. Variable HEMOGLOBIN_MIN has 1104 records (57.35%) with missing values. Variable HEMOGLOBIN_MAX has 1104 records (57.35%) with missing values. Variable HEMOGLOBIN_DIFF has 1104 records (57.35%) with missing values. Variable INR_MEDIAN has 1104 records (57.35%) with missing values. Variable INR_MEAN has 1104 records (57.35%) with missing values. Variable INR_MIN has 1104 records (57.35%) with missing values. Variable INR_MAX has 1104 records (57.35%) with missing values. Variable INR_DIFF has 1104 records (57.35%) with missing values. Variable LACTATE_MEDIAN has 1104 records (57.35%) with missing values. Variable LACTATE_MEAN has 1104 records (57.35%) with missing values. Variable LACTATE_MIN has 1104 records (57.35%) with missing values. Variable LACTATE_MAX has 1104 records (57.35%) with missing values. Variable LACTATE_DIFF has 1104 records (57.35%) with missing values. Variable LEUKOCYTES_MEDIAN has 1104 records (57.35%) with missing values. Variable LEUKOCYTES_MEAN has 1104 records (57.35%) with missing values. Variable LEUKOCYTES_MIN has 1104 records (57.35%) with missing values. Variable LEUKOCYTES_MAX has 1104 records (57.35%) with missing values. Variable LEUKOCYTES_DIFF has 1104 records (57.35%) with missing values. Variable LINFOCITOS_MEDIAN has 1104 records (57.35%) with missing values. Variable LINFOCITOS_MEAN has 1104 records (57.35%) with missing values. Variable LINFOCITOS_MIN has 1104 records (57.35%) with missing values. Variable LINFOCITOS_MAX has 1104 records (57.35%) with missing values. Variable LINFOCITOS_DIFF has 1104 records (57.35%) with missing values. Variable NEUTROPHILES_MEDIAN has 1104 records (57.35%) with missing values. Variable NEUTROPHILES_MEAN has 1104 records (57.35%) with missing values. Variable NEUTROPHILES_MIN has 1104 records (57.35%) with missing values. Variable NEUTROPHILES_MAX has 1104 records (57.35%) with missing values. Variable NEUTROPHILES_DIFF has 1104 records (57.35%) with missing values. Variable P02_ARTERIAL_MEDIAN has 1104 records (57.35%) with missing values. Variable P02_ARTERIAL_MEAN has 1104 records (57.35%) with missing values. Variable P02_ARTERIAL_MIN has 1104 records (57.35%) with missing values. Variable P02_ARTERIAL_MAX has 1104 records (57.35%) with missing values. Variable P02_ARTERIAL_DIFF has 1104 records (57.35%) with missing values. Variable P02_VENOUS_MEDIAN has 1104 records (57.35%) with missing values. Variable P02_VENOUS_MEAN has 1104 records (57.35%) with missing values. Variable P02_VENOUS_MIN has 1104 records (57.35%) with missing values. Variable P02_VENOUS_MAX has 1104 records (57.35%) with missing values. Variable P02_VENOUS_DIFF has 1104 records (57.35%) with missing values. Variable PC02_ARTERIAL_MEDIAN has 1104 records (57.35%) with missing values. Variable PC02_ARTERIAL_MEAN has 1104 records (57.35%) with missing values. Variable PC02_ARTERIAL_MIN has 1104 records (57.35%) with missing values. Variable PC02_ARTERIAL_MAX has 1104 records (57.35%) with missing values. Variable PC02_ARTERIAL_DIFF has 1104 records (57.35%) with missing values. Variable PC02_VENOUS_MEDIAN has 1104 records (57.35%) with missing values. Variable PC02_VENOUS_MEAN has 1104 records (57.35%) with missing values. Variable PC02_VENOUS_MIN has 1104 records (57.35%) with missing values. Variable PC02_VENOUS_MAX has 1104 records (57.35%) with missing values. Variable PC02_VENOUS_DIFF has 1104 records (57.35%) with missing values. Variable PCR_MEDIAN has 1104 records (57.35%) with missing values. Variable PCR_MEAN has 1104 records (57.35%) with missing values. Variable PCR_MIN has 1104 records (57.35%) with missing values. Variable PCR_MAX has 1104 records (57.35%) with missing values. Variable PCR_DIFF has 1104 records (57.35%) with missing values. Variable PH_ARTERIAL_MEDIAN has 1104 records (57.35%) with missing values. Variable PH_ARTERIAL_MEAN has 1104 records (57.35%) with missing values. Variable PH_ARTERIAL_MIN has 1104 records (57.35%) with missing values. Variable PH_ARTERIAL_MAX has 1104 records (57.35%) with missing values. Variable PH_ARTERIAL_DIFF has 1104 records (57.35%) with missing values. Variable PH_VENOUS_MEDIAN has 1104 records (57.35%) with missing values. Variable PH_VENOUS_MEAN has 1104 records (57.35%) with missing values. Variable PH_VENOUS_MIN has 1104 records (57.35%) with missing values. Variable PH_VENOUS_MAX has 1104 records (57.35%) with missing values. Variable PH_VENOUS_DIFF has 1104 records (57.35%) with missing values. Variable PLATELETS_MEDIAN has 1104 records (57.35%) with missing values. Variable PLATELETS_MEAN has 1104 records (57.35%) with missing values. Variable PLATELETS_MIN has 1104 records (57.35%) with missing values. Variable PLATELETS_MAX has 1104 records (57.35%) with missing values. Variable PLATELETS_DIFF has 1104 records (57.35%) with missing values. Variable POTASSIUM_MEDIAN has 1104 records (57.35%) with missing values. Variable POTASSIUM_MEAN has 1104 records (57.35%) with missing values. Variable POTASSIUM_MIN has 1104 records (57.35%) with missing values. Variable POTASSIUM_MAX has 1104 records (57.35%) with missing values. Variable POTASSIUM_DIFF has 1104 records (57.35%) with missing values. Variable SAT02_ARTERIAL_MEDIAN has 1104 records (57.35%) with missing values. Variable SAT02_ARTERIAL_MEAN has 1104 records (57.35%) with missing values. Variable SAT02_ARTERIAL_MIN has 1104 records (57.35%) with missing values. Variable SAT02_ARTERIAL_MAX has 1104 records (57.35%) with missing values. Variable SAT02_ARTERIAL_DIFF has 1104 records (57.35%) with missing values. Variable SAT02_VENOUS_MEDIAN has 1104 records (57.35%) with missing values. Variable SAT02_VENOUS_MEAN has 1104 records (57.35%) with missing values. Variable SAT02_VENOUS_MIN has 1104 records (57.35%) with missing values. Variable SAT02_VENOUS_MAX has 1104 records (57.35%) with missing values. Variable SAT02_VENOUS_DIFF has 1104 records (57.35%) with missing values. Variable SODIUM_MEDIAN has 1104 records (57.35%) with missing values. Variable SODIUM_MEAN has 1104 records (57.35%) with missing values. Variable SODIUM_MIN has 1104 records (57.35%) with missing values. Variable SODIUM_MAX has 1104 records (57.35%) with missing values. Variable SODIUM_DIFF has 1104 records (57.35%) with missing values. Variable TGO_MEDIAN has 1104 records (57.35%) with missing values. Variable TGO_MEAN has 1104 records (57.35%) with missing values. Variable TGO_MIN has 1104 records (57.35%) with missing values. Variable TGO_MAX has 1104 records (57.35%) with missing values. Variable TGO_DIFF has 1104 records (57.35%) with missing values. Variable TGP_MEDIAN has 1104 records (57.35%) with missing values. Variable TGP_MEAN has 1104 records (57.35%) with missing values. Variable TGP_MIN has 1104 records (57.35%) with missing values. Variable TGP_MAX has 1104 records (57.35%) with missing values. Variable TGP_DIFF has 1104 records (57.35%) with missing values. Variable TTPA_MEDIAN has 1104 records (57.35%) with missing values. Variable TTPA_MEAN has 1104 records (57.35%) with missing values. Variable TTPA_MIN has 1104 records (57.35%) with missing values. Variable TTPA_MAX has 1104 records (57.35%) with missing values. Variable TTPA_DIFF has 1104 records (57.35%) with missing values. Variable UREA_MEDIAN has 1104 records (57.35%) with missing values. Variable UREA_MEAN has 1104 records (57.35%) with missing values. Variable UREA_MIN has 1104 records (57.35%) with missing values. Variable UREA_MAX has 1104 records (57.35%) with missing values. Variable UREA_DIFF has 1104 records (57.35%) with missing values. Variable DIMER_MEDIAN has 1104 records (57.35%) with missing values. Variable DIMER_MEAN has 1104 records (57.35%) with missing values. Variable DIMER_MIN has 1104 records (57.35%) with missing values. Variable DIMER_MAX has 1104 records (57.35%) with missing values. Variable DIMER_DIFF has 1104 records (57.35%) with missing values. Variable BLOODPRESSURE_DIASTOLIC_MEAN has 685 records (35.58%) with missing values. Variable BLOODPRESSURE_SISTOLIC_MEAN has 685 records (35.58%) with missing values. Variable HEART_RATE_MEAN has 685 records (35.58%) with missing values. Variable RESPIRATORY_RATE_MEAN has 748 records (38.86%) with missing values. Variable TEMPERATURE_MEAN has 694 records (36.05%) with missing values. Variable OXYGEN_SATURATION_MEAN has 686 records (35.64%) with missing values. Variable BLOODPRESSURE_DIASTOLIC_MEDIAN has 685 records (35.58%) with missing values. Variable BLOODPRESSURE_SISTOLIC_MEDIAN has 685 records (35.58%) with missing values. Variable HEART_RATE_MEDIAN has 685 records (35.58%) with missing values. Variable RESPIRATORY_RATE_MEDIAN has 748 records (38.86%) with missing values. Variable TEMPERATURE_MEDIAN has 694 records (36.05%) with missing values. Variable OXYGEN_SATURATION_MEDIAN has 686 records (35.64%) with missing values. Variable BLOODPRESSURE_DIASTOLIC_MIN has 685 records (35.58%) with missing values. Variable BLOODPRESSURE_SISTOLIC_MIN has 685 records (35.58%) with missing values. Variable HEART_RATE_MIN has 685 records (35.58%) with missing values. Variable RESPIRATORY_RATE_MIN has 748 records (38.86%) with missing values. Variable TEMPERATURE_MIN has 694 records (36.05%) with missing values. Variable OXYGEN_SATURATION_MIN has 686 records (35.64%) with missing values. Variable BLOODPRESSURE_DIASTOLIC_MAX has 685 records (35.58%) with missing values. Variable BLOODPRESSURE_SISTOLIC_MAX has 685 records (35.58%) with missing values. Variable HEART_RATE_MAX has 685 records (35.58%) with missing values. Variable RESPIRATORY_RATE_MAX has 748 records (38.86%) with missing values. Variable TEMPERATURE_MAX has 694 records (36.05%) with missing values. Variable OXYGEN_SATURATION_MAX has 686 records (35.64%) with missing values. Variable BLOODPRESSURE_DIASTOLIC_DIFF has 685 records (35.58%) with missing values. Variable BLOODPRESSURE_SISTOLIC_DIFF has 685 records (35.58%) with missing values. Variable HEART_RATE_DIFF has 685 records (35.58%) with missing values. Variable RESPIRATORY_RATE_DIFF has 748 records (38.86%) with missing values. Variable TEMPERATURE_DIFF has 694 records (36.05%) with missing values. Variable OXYGEN_SATURATION_DIFF has 686 records (35.64%) with missing values. Variable BLOODPRESSURE_DIASTOLIC_DIFF_REL has 685 records (35.58%) with missing values. Variable BLOODPRESSURE_SISTOLIC_DIFF_REL has 685 records (35.58%) with missing values. Variable HEART_RATE_DIFF_REL has 685 records (35.58%) with missing values. Variable RESPIRATORY_RATE_DIFF_REL has 748 records (38.86%) with missing values. Variable TEMPERATURE_DIFF_REL has 694 records (36.05%) with missing values. Variable OXYGEN_SATURATION_DIFF_REL has 686 records (35.64%) with missing values. In total, there are 225 variables with missing values
Observations:
That new column is necessary due to the guidelines from the author who informed about not using the data that already indicate that "ICU" == 1
df_admitted = (raw_data.groupby("PATIENT_VISIT_IDENTIFIER")["ICU"].sum()>0).reset_index()*1
df_admitted.columns = ["PATIENT_VISIT_IDENTIFIER", "GO_ICU"]
raw_data = pd.merge(raw_data, df_admitted, on = "PATIENT_VISIT_IDENTIFIER")
# Create dataframe with the same colums as in raw_data
data = pd.DataFrame(columns=raw_data.columns)
# Create empty list which will hold data of each patient (mean)
patient_first_window = []
# for loop which goes through all patient id
for patient_id in range(0, number_of_patients):
# Create new dataframe which contains ONLY one patient data
one_patient_data = raw_data.loc[raw_data['PATIENT_VISIT_IDENTIFIER'] == patient_id]
# Fill up missing data with mean of available data (available data == particaular patient data)
one_patient_data.fillna(one_patient_data.mean(),inplace=True)
# using dataframe.iloc[[0]] => returns a dataframe
patient_first_window.append(one_patient_data.iloc[[0]])
# Create new DataFrame with mean data of each patient cobined into one row (window 0-2)
# Add patient to the new main Dataframe which will contain all patients data
data = pd.concat(patient_first_window)
/var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/2925090691.py:14: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. /var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/2925090691.py:14: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
data.shape
(385, 232)
data.head()
| PATIENT_VISIT_IDENTIFIER | AGE_ABOVE65 | AGE_PERCENTIL | GENDER | DISEASE GROUPING 1 | DISEASE GROUPING 2 | DISEASE GROUPING 3 | DISEASE GROUPING 4 | DISEASE GROUPING 5 | DISEASE GROUPING 6 | HTN | IMMUNOCOMPROMISED | OTHER | ALBUMIN_MEDIAN | ALBUMIN_MEAN | ALBUMIN_MIN | ALBUMIN_MAX | ALBUMIN_DIFF | BE_ARTERIAL_MEDIAN | BE_ARTERIAL_MEAN | BE_ARTERIAL_MIN | BE_ARTERIAL_MAX | BE_ARTERIAL_DIFF | BE_VENOUS_MEDIAN | BE_VENOUS_MEAN | BE_VENOUS_MIN | BE_VENOUS_MAX | BE_VENOUS_DIFF | BIC_ARTERIAL_MEDIAN | BIC_ARTERIAL_MEAN | BIC_ARTERIAL_MIN | BIC_ARTERIAL_MAX | BIC_ARTERIAL_DIFF | BIC_VENOUS_MEDIAN | BIC_VENOUS_MEAN | BIC_VENOUS_MIN | BIC_VENOUS_MAX | BIC_VENOUS_DIFF | BILLIRUBIN_MEDIAN | BILLIRUBIN_MEAN | BILLIRUBIN_MIN | BILLIRUBIN_MAX | BILLIRUBIN_DIFF | BLAST_MEDIAN | BLAST_MEAN | BLAST_MIN | BLAST_MAX | BLAST_DIFF | CALCIUM_MEDIAN | CALCIUM_MEAN | CALCIUM_MIN | CALCIUM_MAX | CALCIUM_DIFF | CREATININ_MEDIAN | CREATININ_MEAN | CREATININ_MIN | CREATININ_MAX | CREATININ_DIFF | FFA_MEDIAN | FFA_MEAN | FFA_MIN | FFA_MAX | FFA_DIFF | GGT_MEDIAN | GGT_MEAN | GGT_MIN | GGT_MAX | GGT_DIFF | GLUCOSE_MEDIAN | GLUCOSE_MEAN | GLUCOSE_MIN | GLUCOSE_MAX | GLUCOSE_DIFF | HEMATOCRITE_MEDIAN | HEMATOCRITE_MEAN | HEMATOCRITE_MIN | HEMATOCRITE_MAX | HEMATOCRITE_DIFF | HEMOGLOBIN_MEDIAN | HEMOGLOBIN_MEAN | HEMOGLOBIN_MIN | HEMOGLOBIN_MAX | HEMOGLOBIN_DIFF | INR_MEDIAN | INR_MEAN | INR_MIN | INR_MAX | INR_DIFF | LACTATE_MEDIAN | LACTATE_MEAN | LACTATE_MIN | LACTATE_MAX | LACTATE_DIFF | LEUKOCYTES_MEDIAN | LEUKOCYTES_MEAN | LEUKOCYTES_MIN | LEUKOCYTES_MAX | LEUKOCYTES_DIFF | LINFOCITOS_MEDIAN | LINFOCITOS_MEAN | LINFOCITOS_MIN | LINFOCITOS_MAX | LINFOCITOS_DIFF | NEUTROPHILES_MEDIAN | NEUTROPHILES_MEAN | NEUTROPHILES_MIN | NEUTROPHILES_MAX | NEUTROPHILES_DIFF | P02_ARTERIAL_MEDIAN | P02_ARTERIAL_MEAN | P02_ARTERIAL_MIN | P02_ARTERIAL_MAX | P02_ARTERIAL_DIFF | P02_VENOUS_MEDIAN | P02_VENOUS_MEAN | P02_VENOUS_MIN | P02_VENOUS_MAX | P02_VENOUS_DIFF | PC02_ARTERIAL_MEDIAN | PC02_ARTERIAL_MEAN | PC02_ARTERIAL_MIN | PC02_ARTERIAL_MAX | PC02_ARTERIAL_DIFF | PC02_VENOUS_MEDIAN | PC02_VENOUS_MEAN | PC02_VENOUS_MIN | PC02_VENOUS_MAX | PC02_VENOUS_DIFF | PCR_MEDIAN | PCR_MEAN | PCR_MIN | PCR_MAX | PCR_DIFF | PH_ARTERIAL_MEDIAN | PH_ARTERIAL_MEAN | PH_ARTERIAL_MIN | PH_ARTERIAL_MAX | PH_ARTERIAL_DIFF | PH_VENOUS_MEDIAN | PH_VENOUS_MEAN | PH_VENOUS_MIN | PH_VENOUS_MAX | PH_VENOUS_DIFF | PLATELETS_MEDIAN | PLATELETS_MEAN | PLATELETS_MIN | PLATELETS_MAX | PLATELETS_DIFF | POTASSIUM_MEDIAN | POTASSIUM_MEAN | POTASSIUM_MIN | POTASSIUM_MAX | POTASSIUM_DIFF | SAT02_ARTERIAL_MEDIAN | SAT02_ARTERIAL_MEAN | SAT02_ARTERIAL_MIN | SAT02_ARTERIAL_MAX | SAT02_ARTERIAL_DIFF | SAT02_VENOUS_MEDIAN | SAT02_VENOUS_MEAN | SAT02_VENOUS_MIN | SAT02_VENOUS_MAX | SAT02_VENOUS_DIFF | SODIUM_MEDIAN | SODIUM_MEAN | SODIUM_MIN | SODIUM_MAX | SODIUM_DIFF | TGO_MEDIAN | TGO_MEAN | TGO_MIN | TGO_MAX | TGO_DIFF | TGP_MEDIAN | TGP_MEAN | TGP_MIN | TGP_MAX | TGP_DIFF | TTPA_MEDIAN | TTPA_MEAN | TTPA_MIN | TTPA_MAX | TTPA_DIFF | UREA_MEDIAN | UREA_MEAN | UREA_MIN | UREA_MAX | UREA_DIFF | DIMER_MEDIAN | DIMER_MEAN | DIMER_MIN | DIMER_MAX | DIMER_DIFF | BLOODPRESSURE_DIASTOLIC_MEAN | BLOODPRESSURE_SISTOLIC_MEAN | HEART_RATE_MEAN | RESPIRATORY_RATE_MEAN | TEMPERATURE_MEAN | OXYGEN_SATURATION_MEAN | BLOODPRESSURE_DIASTOLIC_MEDIAN | BLOODPRESSURE_SISTOLIC_MEDIAN | HEART_RATE_MEDIAN | RESPIRATORY_RATE_MEDIAN | TEMPERATURE_MEDIAN | OXYGEN_SATURATION_MEDIAN | BLOODPRESSURE_DIASTOLIC_MIN | BLOODPRESSURE_SISTOLIC_MIN | HEART_RATE_MIN | RESPIRATORY_RATE_MIN | TEMPERATURE_MIN | OXYGEN_SATURATION_MIN | BLOODPRESSURE_DIASTOLIC_MAX | BLOODPRESSURE_SISTOLIC_MAX | HEART_RATE_MAX | RESPIRATORY_RATE_MAX | TEMPERATURE_MAX | OXYGEN_SATURATION_MAX | BLOODPRESSURE_DIASTOLIC_DIFF | BLOODPRESSURE_SISTOLIC_DIFF | HEART_RATE_DIFF | RESPIRATORY_RATE_DIFF | TEMPERATURE_DIFF | OXYGEN_SATURATION_DIFF | BLOODPRESSURE_DIASTOLIC_DIFF_REL | BLOODPRESSURE_SISTOLIC_DIFF_REL | HEART_RATE_DIFF_REL | RESPIRATORY_RATE_DIFF_REL | TEMPERATURE_DIFF_REL | OXYGEN_SATURATION_DIFF_REL | WINDOW | ICU | GO_ICU | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 60th | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.302632 | 0.302632 | 0.302632 | 0.302632 | -1.0 | -0.935829 | -0.935829 | -0.935829 | -0.935829 | -1.0 | -0.931937 | -0.931937 | -0.931937 | -0.931937 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.365854 | -0.365854 | -0.365854 | -0.365854 | -1.0 | -0.959009 | -0.959009 | -0.959009 | -0.959009 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.255102 | 0.255102 | 0.255102 | 0.255102 | -1.0 | -0.897381 | -0.897381 | -0.897381 | -0.897381 | -1.0 | -0.800640 | -0.800640 | -0.800640 | -0.800640 | -1.0 | -0.807243 | -0.807243 | -0.807243 | -0.807243 | -1.0 | -0.891993 | -0.891993 | -0.891993 | -0.891993 | -1.0 | -0.115304 | -0.115304 | -0.115304 | -0.115304 | -1.0 | -0.121951 | -0.121951 | -0.121951 | -0.121951 | -1.0 | -0.956085 | -0.956085 | -0.956085 | -0.956085 | -1.0 | 0.018489 | 0.018489 | 0.018489 | 0.018489 | -1.0 | -0.799343 | -0.799343 | -0.799343 | -0.799343 | -1.0 | -0.779046 | -0.779046 | -0.779046 | -0.779046 | -1.0 | -0.868747 | -0.868747 | -0.868747 | -0.868747 | -1.0 | -0.268293 | -0.268293 | -0.268293 | -0.268293 | -1.0 | -0.467456 | -0.467456 | -0.467456 | -0.467456 | -1.0 | -0.827586 | -0.827586 | -0.827586 | -0.827586 | -1.0 | -0.785276 | -0.785276 | -0.785276 | -0.785276 | -1.0 | -0.937618 | -0.937618 | -0.937618 | -0.937618 | -1.0 | 0.404255 | 0.404255 | 0.404255 | 0.404255 | -1.0 | 0.378788 | 0.378788 | 0.378788 | 0.378788 | -1.0 | -0.506008 | -0.506008 | -0.506008 | -0.506008 | -1.0 | -0.592593 | -0.592593 | -0.592593 | -0.592593 | -1.0 | 0.893939 | 0.893939 | 0.893939 | 0.893939 | -1.0 | 0.635802 | 0.635802 | 0.635802 | 0.635802 | -1.0 | 0.057143 | 0.057143 | 0.057143 | 0.057143 | -1.0 | -0.998134 | -0.998134 | -0.998134 | -0.998134 | -1.0 | -0.987424 | -0.987424 | -0.987424 | -0.987424 | -1.0 | -0.836123 | -0.836123 | -0.836123 | -0.836123 | -1.0 | -0.836145 | -0.836145 | -0.836145 | -0.836145 | -1.0 | -0.995837 | -0.995837 | -0.995837 | -0.995837 | -1.0 | 0.086420 | -0.230769 | -0.283019 | -0.593220 | -0.285714 | 0.736842 | 0.086420 | -0.230769 | -0.283019 | -0.586207 | -0.285714 | 0.736842 | 0.237113 | 0.000000 | -0.162393 | -0.500000 | 0.208791 | 0.898990 | -0.247863 | -0.459459 | -0.432836 | -0.636364 | -0.420290 | 0.736842 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 0-2 | 0 | 1 |
| 5 | 1 | 1 | 90th | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.333333 | 0.333333 | 0.333333 | 0.333333 | -1.0 | -0.918004 | -0.918004 | -0.918004 | -0.918004 | -1.0 | -0.972077 | -0.972077 | -0.972077 | -0.972077 | -1.0 | -0.268293 | -0.268293 | -0.268293 | -0.268293 | -1.0 | -0.268293 | -0.268293 | -0.268293 | -0.268293 | -1.0 | -0.948485 | -0.948485 | -0.948485 | -0.948485 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.476190 | 0.476190 | 0.476190 | 0.476190 | -1.0 | -0.652883 | -0.652883 | -0.652883 | -0.652883 | -1.0 | -0.751244 | -0.751244 | -0.751244 | -0.751244 | -1.0 | -0.966511 | -0.966511 | -0.966511 | -0.966511 | -1.0 | -0.891993 | -0.891993 | -0.891993 | -0.891993 | -1.0 | -0.519217 | -0.519217 | -0.519217 | -0.519217 | -1.0 | -0.538064 | -0.538064 | -0.538064 | -0.538064 | -1.0 | -0.973233 | -0.973233 | -0.973233 | -0.973233 | -1.0 | -0.983893 | -0.983893 | -0.983893 | -0.983893 | -1.0 | -0.654049 | -0.654049 | -0.654049 | -0.654049 | -1.0 | -0.730290 | -0.730290 | -0.730290 | -0.730290 | -1.0 | -0.715924 | -0.715924 | -0.715924 | -0.715924 | -1.0 | 0.052846 | 0.052846 | 0.052846 | 0.052846 | -1.0 | -0.092702 | -0.092702 | -0.092702 | -0.092702 | -1.0 | -0.760920 | -0.760920 | -0.760920 | -0.760920 | -1.0 | -0.664622 | -0.664622 | -0.664622 | -0.664622 | -1.0 | -0.921991 | -0.921991 | -0.921991 | -0.921991 | -1.0 | 0.248227 | 0.248227 | 0.248227 | 0.248227 | -1.0 | 0.222222 | 0.222222 | 0.222222 | 0.222222 | -1.0 | -0.388518 | -0.388518 | -0.388518 | -0.388518 | -1.0 | -0.518519 | -0.518519 | -0.518519 | -0.518519 | -1.0 | 0.969697 | 0.969697 | 0.969697 | 0.969697 | -1.0 | 0.641975 | 0.641975 | 0.641975 | 0.641975 | -1.0 | -0.168254 | -0.168254 | -0.168254 | -0.168254 | -1.0 | -0.995957 | -0.995957 | -0.995957 | -0.995957 | -1.0 | -0.989075 | -0.989075 | -0.989075 | -0.989075 | -1.0 | -0.769041 | -0.769041 | -0.769041 | -0.769041 | -1.0 | -0.633735 | -0.633735 | -0.633735 | -0.633735 | -1.0 | -0.966593 | -0.966593 | -0.966593 | -0.966593 | -1.0 | -0.283951 | -0.046154 | 0.188679 | 0.830508 | -0.107143 | 1.000000 | -0.283951 | -0.046154 | 0.188679 | 0.862069 | -0.107143 | 1.000000 | -0.072165 | 0.150000 | 0.264957 | 1.000000 | 0.318681 | 1.000000 | -0.504274 | -0.329730 | -0.059701 | 0.636364 | -0.275362 | 1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 0-2 | 1 | 1 |
| 10 | 2 | 0 | 10th | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.605263 | 0.605263 | 0.605263 | 0.605263 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.938950 | -0.938950 | -0.938950 | -0.938950 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.357143 | 0.357143 | 0.357143 | 0.357143 | -1.0 | -0.912243 | -0.912243 | -0.912243 | -0.912243 | -1.0 | -0.742004 | -0.742004 | -0.742004 | -0.742004 | -1.0 | -0.958528 | -0.958528 | -0.958528 | -0.958528 | -1.0 | -0.780261 | -0.780261 | -0.780261 | -0.780261 | -1.0 | 0.144654 | 0.144654 | 0.144654 | 0.144654 | -1.0 | 0.158537 | 0.158537 | 0.158537 | 0.158537 | -1.0 | -0.959849 | -0.959849 | -0.959849 | -0.959849 | -1.0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | -1.0 | -0.382773 | -0.382773 | -0.382773 | -0.382773 | -1.0 | -0.908714 | -0.908714 | -0.908714 | -0.908714 | -1.0 | -0.412965 | -0.412965 | -0.412965 | -0.412965 | -1.0 | -0.170732 | -0.170732 | -0.170732 | -0.170732 | -1.0 | -0.704142 | -0.704142 | -0.704142 | -0.704142 | -1.0 | -0.779310 | -0.779310 | -0.779310 | -0.779310 | -1.0 | -0.754601 | -0.754601 | -0.754601 | -0.754601 | -1.0 | -0.939887 | -0.939887 | -0.939887 | -0.939887 | -1.0 | 0.234043 | 0.234043 | 0.234043 | 0.234043 | -1.0 | 0.363636 | 0.363636 | 0.363636 | 0.363636 | -1.0 | -0.399199 | -0.399199 | -0.399199 | -0.399199 | -1.0 | -0.703704 | -0.703704 | -0.703704 | -0.703704 | -1.0 | 0.939394 | 0.939394 | 0.939394 | 0.939394 | -1.0 | 0.345679 | 0.345679 | 0.345679 | 0.345679 | -1.0 | 0.085714 | 0.085714 | 0.085714 | 0.085714 | -1.0 | -0.995428 | -0.995428 | -0.995428 | -0.995428 | -1.0 | -0.986662 | -0.986662 | -0.986662 | -0.986662 | -1.0 | -0.846633 | -0.846633 | -0.846633 | -0.846633 | -1.0 | -0.836145 | -0.836145 | -0.836145 | -0.836145 | -1.0 | -0.978029 | -0.978029 | -0.978029 | -0.978029 | -1.0 | -0.427812 | -0.688640 | -0.097048 | -0.636692 | 0.207217 | 0.844055 | -0.423868 | -0.733333 | -0.100629 | -0.586207 | 0.232143 | 0.850877 | -0.443299 | -0.541667 | -0.259259 | -0.666667 | 0.373626 | 0.878788 | -0.418803 | -0.535135 | -0.049751 | -0.575758 | 0.101449 | 0.982456 | -0.600000 | -0.554192 | -0.521628 | -0.803922 | -0.750000 | -0.885522 | -0.595604 | -0.419448 | -0.681860 | -0.792832 | -0.752732 | -0.887561 | 0-2 | 0 | 1 |
| 15 | 3 | 0 | 40th | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | -0.263158 | -0.263158 | -0.263158 | -0.263158 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.972789 | -0.972789 | -0.972789 | -0.972789 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.326531 | 0.326531 | 0.326531 | 0.326531 | -1.0 | -0.968861 | -0.968861 | -0.968861 | -0.968861 | -1.0 | -0.194030 | -0.194030 | -0.194030 | -0.194030 | -1.0 | -0.316589 | -0.316589 | -0.316589 | -0.316589 | -1.0 | -0.891993 | -0.891993 | -0.891993 | -0.891993 | -1.0 | -0.203354 | -0.203354 | -0.203354 | -0.203354 | -1.0 | -0.219512 | -0.219512 | -0.219512 | -0.219512 | -1.0 | -0.959849 | -0.959849 | -0.959849 | -0.959849 | -1.0 | -0.828421 | -0.828421 | -0.828421 | -0.828421 | -1.0 | -0.729239 | -0.729239 | -0.729239 | -0.729239 | -1.0 | -0.836100 | -0.836100 | -0.836100 | -0.836100 | -1.0 | -0.784714 | -0.784714 | -0.784714 | -0.784714 | -1.0 | -0.170732 | -0.170732 | -0.170732 | -0.170732 | -1.0 | -0.633136 | -0.633136 | -0.633136 | -0.633136 | -1.0 | -0.779310 | -0.779310 | -0.779310 | -0.779310 | -1.0 | -0.779141 | -0.779141 | -0.779141 | -0.779141 | -1.0 | -0.503592 | -0.503592 | -0.503592 | -0.503592 | -1.0 | 0.234043 | 0.234043 | 0.234043 | 0.234043 | -1.0 | 0.363636 | 0.363636 | 0.363636 | 0.363636 | -1.0 | -0.564753 | -0.564753 | -0.564753 | -0.564753 | -1.0 | -0.777778 | -0.777778 | -0.777778 | -0.777778 | -1.0 | 0.939394 | 0.939394 | 0.939394 | 0.939394 | -1.0 | 0.580247 | 0.580247 | 0.580247 | 0.580247 | -1.0 | 0.200000 | 0.200000 | 0.200000 | 0.200000 | -1.0 | -0.989549 | -0.989549 | -0.989549 | -0.989549 | -1.0 | -0.956555 | -0.956555 | -0.956555 | -0.956555 | -1.0 | -0.846633 | -0.846633 | -0.846633 | -0.846633 | -1.0 | -0.937349 | -0.937349 | -0.937349 | -0.937349 | -1.0 | -0.978029 | -0.978029 | -0.978029 | -0.978029 | -1.0 | -0.132620 | -0.484650 | -0.448553 | -0.506215 | -0.119762 | 0.652398 | -0.102881 | -0.482051 | -0.459119 | -0.494253 | -0.142857 | 0.666667 | -0.127148 | -0.329167 | -0.435897 | -0.547619 | 0.216117 | 0.629630 | -0.247863 | -0.506306 | -0.263682 | -0.454545 | -0.024155 | 0.754386 | -0.692754 | -0.730061 | -0.582697 | -0.784314 | -0.682540 | -0.723906 | -0.769565 | -0.685906 | -0.689698 | -0.776583 | -0.682540 | -0.724145 | 0-2 | 0 | 0 |
| 20 | 4 | 0 | 10th | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.605263 | 0.605263 | 0.605263 | 0.605263 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.317073 | -0.317073 | -0.317073 | -0.317073 | -1.0 | -0.935113 | -0.935113 | -0.935113 | -0.935113 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 0.357143 | 0.357143 | 0.357143 | 0.357143 | -1.0 | -0.913659 | -0.913659 | -0.913659 | -0.913659 | -1.0 | -0.829424 | -0.829424 | -0.829424 | -0.829424 | -1.0 | -0.938084 | -0.938084 | -0.938084 | -0.938084 | -1.0 | -0.851024 | -0.851024 | -0.851024 | -0.851024 | -1.0 | 0.358491 | 0.358491 | 0.358491 | 0.358491 | -1.0 | 0.304878 | 0.304878 | 0.304878 | 0.304878 | -1.0 | -0.959849 | -0.959849 | -0.959849 | -0.959849 | -1.0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | -1.0 | -0.702202 | -0.702202 | -0.702202 | -0.702202 | -1.0 | -0.641079 | -0.641079 | -0.641079 | -0.641079 | -1.0 | -0.812725 | -0.812725 | -0.812725 | -0.812725 | -1.0 | -0.170732 | -0.170732 | -0.170732 | -0.170732 | -1.0 | -0.704142 | -0.704142 | -0.704142 | -0.704142 | -1.0 | -0.779310 | -0.779310 | -0.779310 | -0.779310 | -1.0 | -0.754601 | -0.754601 | -0.754601 | -0.754601 | -1.0 | -0.990926 | -0.990926 | -0.990926 | -0.990926 | -1.0 | 0.234043 | 0.234043 | 0.234043 | 0.234043 | -1.0 | 0.363636 | 0.363636 | 0.363636 | 0.363636 | -1.0 | -0.457944 | -0.457944 | -0.457944 | -0.457944 | -1.0 | -0.592593 | -0.592593 | -0.592593 | -0.592593 | -1.0 | 0.939394 | 0.939394 | 0.939394 | 0.939394 | -1.0 | 0.345679 | 0.345679 | 0.345679 | 0.345679 | -1.0 | 0.142857 | 0.142857 | 0.142857 | 0.142857 | -1.0 | -0.998507 | -0.998507 | -0.998507 | -0.998507 | -1.0 | -0.991235 | -0.991235 | -0.991235 | -0.991235 | -1.0 | -0.846633 | -0.846633 | -0.846633 | -0.846633 | -1.0 | -0.903614 | -0.903614 | -0.903614 | -0.903614 | -1.0 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.0 | 0.236838 | -0.101680 | 0.144491 | -0.539451 | 0.189142 | 0.852390 | 0.251029 | -0.102564 | 0.160377 | -0.540230 | 0.226190 | 0.833333 | 0.237113 | 0.000000 | 0.048433 | -0.500000 | 0.362637 | 0.925926 | -0.076923 | -0.293694 | 0.019900 | -0.535354 | 0.072464 | 0.912281 | -0.826087 | -0.811861 | -0.725191 | -0.901961 | -0.761905 | -0.959596 | -0.884058 | -0.826611 | -0.839287 | -0.896057 | -0.766042 | -0.960291 | 0-2 | 0 | 0 |
Important Info:
From now, AMOUNT OF PATIENTS = AMOUNT OF ROWS
#drop rows with ICU == 1 ie drop data when the target variable is present, as stipulated by dataset author
data = data[data.ICU == 0].reset_index(drop = True)
data.shape
(353, 232)
After removing patients who went to ICU straight away, we have 353 patients out of 385
#drop unnecessary columns
data = data.drop(["WINDOW", "ICU"],axis = 1)
I am assuming that mean of values which belong to the patient that eventually go to ICU, will differ from mean values which belong to patient who will not go to ICU
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 353 entries, 0 to 352 Columns: 230 entries, PATIENT_VISIT_IDENTIFIER to GO_ICU dtypes: float64(225), int64(4), object(1) memory usage: 634.4+ KB
4 objects out of 6 should be an int64. 2 columns which are objects are: 'AGE_PERCENTIL' and 'WINDOW'
objects = data.select_dtypes(object).columns
print(objects)
Index(['AGE_PERCENTIL'], dtype='object')
data['PATIENT_VISIT_IDENTIFIER'] = data['PATIENT_VISIT_IDENTIFIER'].astype('int64')
data['AGE_ABOVE65'] = data['AGE_ABOVE65'].astype('int64')
data['GENDER'] = data['GENDER'].astype('int64')
raw_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1925 entries, 0 to 1924 Columns: 232 entries, PATIENT_VISIT_IDENTIFIER to GO_ICU dtypes: float64(225), int64(5), object(2) memory usage: 3.4+ MB
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 353 entries, 0 to 352 Columns: 230 entries, PATIENT_VISIT_IDENTIFIER to GO_ICU dtypes: float64(225), int64(4), object(1) memory usage: 634.4+ KB
def check_for_missing_data_in_columns(dataset_name):
print('NaN values =', dataset_name.isnull().sum().sum())
print("""""")
vars_with_missing = []
for feature in dataset_name.columns:
missings = dataset_name[feature].isna().sum()
if missings > 0 :
vars_with_missing.append(feature)
missings_perc = missings / dataset_name.shape[0]
print('Column {} has {} records ({:.2%}) with missing values.'.format(feature, missings, missings_perc))
print('In total, there are {} columns with missing values'.format(len(vars_with_missing)))
check_for_missing_data_in_columns(data)
NaN values = 405 Column DISEASE GROUPING 1 has 1 records (0.28%) with missing values. Column DISEASE GROUPING 2 has 1 records (0.28%) with missing values. Column DISEASE GROUPING 3 has 1 records (0.28%) with missing values. Column DISEASE GROUPING 4 has 1 records (0.28%) with missing values. Column DISEASE GROUPING 5 has 1 records (0.28%) with missing values. Column DISEASE GROUPING 6 has 1 records (0.28%) with missing values. Column HTN has 1 records (0.28%) with missing values. Column IMMUNOCOMPROMISED has 1 records (0.28%) with missing values. Column OTHER has 1 records (0.28%) with missing values. Column ALBUMIN_MEDIAN has 2 records (0.57%) with missing values. Column ALBUMIN_MEAN has 2 records (0.57%) with missing values. Column ALBUMIN_MIN has 2 records (0.57%) with missing values. Column ALBUMIN_MAX has 2 records (0.57%) with missing values. Column ALBUMIN_DIFF has 2 records (0.57%) with missing values. Column BE_ARTERIAL_MEDIAN has 2 records (0.57%) with missing values. Column BE_ARTERIAL_MEAN has 2 records (0.57%) with missing values. Column BE_ARTERIAL_MIN has 2 records (0.57%) with missing values. Column BE_ARTERIAL_MAX has 2 records (0.57%) with missing values. Column BE_ARTERIAL_DIFF has 2 records (0.57%) with missing values. Column BE_VENOUS_MEDIAN has 2 records (0.57%) with missing values. Column BE_VENOUS_MEAN has 2 records (0.57%) with missing values. Column BE_VENOUS_MIN has 2 records (0.57%) with missing values. Column BE_VENOUS_MAX has 2 records (0.57%) with missing values. Column BE_VENOUS_DIFF has 2 records (0.57%) with missing values. Column BIC_ARTERIAL_MEDIAN has 2 records (0.57%) with missing values. Column BIC_ARTERIAL_MEAN has 2 records (0.57%) with missing values. Column BIC_ARTERIAL_MIN has 2 records (0.57%) with missing values. Column BIC_ARTERIAL_MAX has 2 records (0.57%) with missing values. Column BIC_ARTERIAL_DIFF has 2 records (0.57%) with missing values. Column BIC_VENOUS_MEDIAN has 2 records (0.57%) with missing values. Column BIC_VENOUS_MEAN has 2 records (0.57%) with missing values. Column BIC_VENOUS_MIN has 2 records (0.57%) with missing values. Column BIC_VENOUS_MAX has 2 records (0.57%) with missing values. Column BIC_VENOUS_DIFF has 2 records (0.57%) with missing values. Column BILLIRUBIN_MEDIAN has 2 records (0.57%) with missing values. Column BILLIRUBIN_MEAN has 2 records (0.57%) with missing values. Column BILLIRUBIN_MIN has 2 records (0.57%) with missing values. Column BILLIRUBIN_MAX has 2 records (0.57%) with missing values. Column BILLIRUBIN_DIFF has 2 records (0.57%) with missing values. Column BLAST_MEDIAN has 2 records (0.57%) with missing values. Column BLAST_MEAN has 2 records (0.57%) with missing values. Column BLAST_MIN has 2 records (0.57%) with missing values. Column BLAST_MAX has 2 records (0.57%) with missing values. Column BLAST_DIFF has 2 records (0.57%) with missing values. Column CALCIUM_MEDIAN has 2 records (0.57%) with missing values. Column CALCIUM_MEAN has 2 records (0.57%) with missing values. Column CALCIUM_MIN has 2 records (0.57%) with missing values. Column CALCIUM_MAX has 2 records (0.57%) with missing values. Column CALCIUM_DIFF has 2 records (0.57%) with missing values. Column CREATININ_MEDIAN has 2 records (0.57%) with missing values. Column CREATININ_MEAN has 2 records (0.57%) with missing values. Column CREATININ_MIN has 2 records (0.57%) with missing values. Column CREATININ_MAX has 2 records (0.57%) with missing values. Column CREATININ_DIFF has 2 records (0.57%) with missing values. Column FFA_MEDIAN has 2 records (0.57%) with missing values. Column FFA_MEAN has 2 records (0.57%) with missing values. Column FFA_MIN has 2 records (0.57%) with missing values. Column FFA_MAX has 2 records (0.57%) with missing values. Column FFA_DIFF has 2 records (0.57%) with missing values. Column GGT_MEDIAN has 2 records (0.57%) with missing values. Column GGT_MEAN has 2 records (0.57%) with missing values. Column GGT_MIN has 2 records (0.57%) with missing values. Column GGT_MAX has 2 records (0.57%) with missing values. Column GGT_DIFF has 2 records (0.57%) with missing values. Column GLUCOSE_MEDIAN has 2 records (0.57%) with missing values. Column GLUCOSE_MEAN has 2 records (0.57%) with missing values. Column GLUCOSE_MIN has 2 records (0.57%) with missing values. Column GLUCOSE_MAX has 2 records (0.57%) with missing values. Column GLUCOSE_DIFF has 2 records (0.57%) with missing values. Column HEMATOCRITE_MEDIAN has 2 records (0.57%) with missing values. Column HEMATOCRITE_MEAN has 2 records (0.57%) with missing values. Column HEMATOCRITE_MIN has 2 records (0.57%) with missing values. Column HEMATOCRITE_MAX has 2 records (0.57%) with missing values. Column HEMATOCRITE_DIFF has 2 records (0.57%) with missing values. Column HEMOGLOBIN_MEDIAN has 2 records (0.57%) with missing values. Column HEMOGLOBIN_MEAN has 2 records (0.57%) with missing values. Column HEMOGLOBIN_MIN has 2 records (0.57%) with missing values. Column HEMOGLOBIN_MAX has 2 records (0.57%) with missing values. Column HEMOGLOBIN_DIFF has 2 records (0.57%) with missing values. Column INR_MEDIAN has 2 records (0.57%) with missing values. Column INR_MEAN has 2 records (0.57%) with missing values. Column INR_MIN has 2 records (0.57%) with missing values. Column INR_MAX has 2 records (0.57%) with missing values. Column INR_DIFF has 2 records (0.57%) with missing values. Column LACTATE_MEDIAN has 2 records (0.57%) with missing values. Column LACTATE_MEAN has 2 records (0.57%) with missing values. Column LACTATE_MIN has 2 records (0.57%) with missing values. Column LACTATE_MAX has 2 records (0.57%) with missing values. Column LACTATE_DIFF has 2 records (0.57%) with missing values. Column LEUKOCYTES_MEDIAN has 2 records (0.57%) with missing values. Column LEUKOCYTES_MEAN has 2 records (0.57%) with missing values. Column LEUKOCYTES_MIN has 2 records (0.57%) with missing values. Column LEUKOCYTES_MAX has 2 records (0.57%) with missing values. Column LEUKOCYTES_DIFF has 2 records (0.57%) with missing values. Column LINFOCITOS_MEDIAN has 2 records (0.57%) with missing values. Column LINFOCITOS_MEAN has 2 records (0.57%) with missing values. Column LINFOCITOS_MIN has 2 records (0.57%) with missing values. Column LINFOCITOS_MAX has 2 records (0.57%) with missing values. Column LINFOCITOS_DIFF has 2 records (0.57%) with missing values. Column NEUTROPHILES_MEDIAN has 2 records (0.57%) with missing values. Column NEUTROPHILES_MEAN has 2 records (0.57%) with missing values. Column NEUTROPHILES_MIN has 2 records (0.57%) with missing values. Column NEUTROPHILES_MAX has 2 records (0.57%) with missing values. Column NEUTROPHILES_DIFF has 2 records (0.57%) with missing values. Column P02_ARTERIAL_MEDIAN has 2 records (0.57%) with missing values. Column P02_ARTERIAL_MEAN has 2 records (0.57%) with missing values. Column P02_ARTERIAL_MIN has 2 records (0.57%) with missing values. Column P02_ARTERIAL_MAX has 2 records (0.57%) with missing values. Column P02_ARTERIAL_DIFF has 2 records (0.57%) with missing values. Column P02_VENOUS_MEDIAN has 2 records (0.57%) with missing values. Column P02_VENOUS_MEAN has 2 records (0.57%) with missing values. Column P02_VENOUS_MIN has 2 records (0.57%) with missing values. Column P02_VENOUS_MAX has 2 records (0.57%) with missing values. Column P02_VENOUS_DIFF has 2 records (0.57%) with missing values. Column PC02_ARTERIAL_MEDIAN has 2 records (0.57%) with missing values. Column PC02_ARTERIAL_MEAN has 2 records (0.57%) with missing values. Column PC02_ARTERIAL_MIN has 2 records (0.57%) with missing values. Column PC02_ARTERIAL_MAX has 2 records (0.57%) with missing values. Column PC02_ARTERIAL_DIFF has 2 records (0.57%) with missing values. Column PC02_VENOUS_MEDIAN has 2 records (0.57%) with missing values. Column PC02_VENOUS_MEAN has 2 records (0.57%) with missing values. Column PC02_VENOUS_MIN has 2 records (0.57%) with missing values. Column PC02_VENOUS_MAX has 2 records (0.57%) with missing values. Column PC02_VENOUS_DIFF has 2 records (0.57%) with missing values. Column PCR_MEDIAN has 2 records (0.57%) with missing values. Column PCR_MEAN has 2 records (0.57%) with missing values. Column PCR_MIN has 2 records (0.57%) with missing values. Column PCR_MAX has 2 records (0.57%) with missing values. Column PCR_DIFF has 2 records (0.57%) with missing values. Column PH_ARTERIAL_MEDIAN has 2 records (0.57%) with missing values. Column PH_ARTERIAL_MEAN has 2 records (0.57%) with missing values. Column PH_ARTERIAL_MIN has 2 records (0.57%) with missing values. Column PH_ARTERIAL_MAX has 2 records (0.57%) with missing values. Column PH_ARTERIAL_DIFF has 2 records (0.57%) with missing values. Column PH_VENOUS_MEDIAN has 2 records (0.57%) with missing values. Column PH_VENOUS_MEAN has 2 records (0.57%) with missing values. Column PH_VENOUS_MIN has 2 records (0.57%) with missing values. Column PH_VENOUS_MAX has 2 records (0.57%) with missing values. Column PH_VENOUS_DIFF has 2 records (0.57%) with missing values. Column PLATELETS_MEDIAN has 2 records (0.57%) with missing values. Column PLATELETS_MEAN has 2 records (0.57%) with missing values. Column PLATELETS_MIN has 2 records (0.57%) with missing values. Column PLATELETS_MAX has 2 records (0.57%) with missing values. Column PLATELETS_DIFF has 2 records (0.57%) with missing values. Column POTASSIUM_MEDIAN has 2 records (0.57%) with missing values. Column POTASSIUM_MEAN has 2 records (0.57%) with missing values. Column POTASSIUM_MIN has 2 records (0.57%) with missing values. Column POTASSIUM_MAX has 2 records (0.57%) with missing values. Column POTASSIUM_DIFF has 2 records (0.57%) with missing values. Column SAT02_ARTERIAL_MEDIAN has 2 records (0.57%) with missing values. Column SAT02_ARTERIAL_MEAN has 2 records (0.57%) with missing values. Column SAT02_ARTERIAL_MIN has 2 records (0.57%) with missing values. Column SAT02_ARTERIAL_MAX has 2 records (0.57%) with missing values. Column SAT02_ARTERIAL_DIFF has 2 records (0.57%) with missing values. Column SAT02_VENOUS_MEDIAN has 2 records (0.57%) with missing values. Column SAT02_VENOUS_MEAN has 2 records (0.57%) with missing values. Column SAT02_VENOUS_MIN has 2 records (0.57%) with missing values. Column SAT02_VENOUS_MAX has 2 records (0.57%) with missing values. Column SAT02_VENOUS_DIFF has 2 records (0.57%) with missing values. Column SODIUM_MEDIAN has 2 records (0.57%) with missing values. Column SODIUM_MEAN has 2 records (0.57%) with missing values. Column SODIUM_MIN has 2 records (0.57%) with missing values. Column SODIUM_MAX has 2 records (0.57%) with missing values. Column SODIUM_DIFF has 2 records (0.57%) with missing values. Column TGO_MEDIAN has 2 records (0.57%) with missing values. Column TGO_MEAN has 2 records (0.57%) with missing values. Column TGO_MIN has 2 records (0.57%) with missing values. Column TGO_MAX has 2 records (0.57%) with missing values. Column TGO_DIFF has 2 records (0.57%) with missing values. Column TGP_MEDIAN has 2 records (0.57%) with missing values. Column TGP_MEAN has 2 records (0.57%) with missing values. Column TGP_MIN has 2 records (0.57%) with missing values. Column TGP_MAX has 2 records (0.57%) with missing values. Column TGP_DIFF has 2 records (0.57%) with missing values. Column TTPA_MEDIAN has 2 records (0.57%) with missing values. Column TTPA_MEAN has 2 records (0.57%) with missing values. Column TTPA_MIN has 2 records (0.57%) with missing values. Column TTPA_MAX has 2 records (0.57%) with missing values. Column TTPA_DIFF has 2 records (0.57%) with missing values. Column UREA_MEDIAN has 2 records (0.57%) with missing values. Column UREA_MEAN has 2 records (0.57%) with missing values. Column UREA_MIN has 2 records (0.57%) with missing values. Column UREA_MAX has 2 records (0.57%) with missing values. Column UREA_DIFF has 2 records (0.57%) with missing values. Column DIMER_MEDIAN has 2 records (0.57%) with missing values. Column DIMER_MEAN has 2 records (0.57%) with missing values. Column DIMER_MIN has 2 records (0.57%) with missing values. Column DIMER_MAX has 2 records (0.57%) with missing values. Column DIMER_DIFF has 2 records (0.57%) with missing values. Column BLOODPRESSURE_DIASTOLIC_MEAN has 1 records (0.28%) with missing values. Column BLOODPRESSURE_SISTOLIC_MEAN has 1 records (0.28%) with missing values. Column HEART_RATE_MEAN has 1 records (0.28%) with missing values. Column RESPIRATORY_RATE_MEAN has 1 records (0.28%) with missing values. Column TEMPERATURE_MEAN has 1 records (0.28%) with missing values. Column OXYGEN_SATURATION_MEAN has 1 records (0.28%) with missing values. Column BLOODPRESSURE_DIASTOLIC_MEDIAN has 1 records (0.28%) with missing values. Column BLOODPRESSURE_SISTOLIC_MEDIAN has 1 records (0.28%) with missing values. Column HEART_RATE_MEDIAN has 1 records (0.28%) with missing values. Column RESPIRATORY_RATE_MEDIAN has 1 records (0.28%) with missing values. Column TEMPERATURE_MEDIAN has 1 records (0.28%) with missing values. Column OXYGEN_SATURATION_MEDIAN has 1 records (0.28%) with missing values. Column BLOODPRESSURE_DIASTOLIC_MIN has 1 records (0.28%) with missing values. Column BLOODPRESSURE_SISTOLIC_MIN has 1 records (0.28%) with missing values. Column HEART_RATE_MIN has 1 records (0.28%) with missing values. Column RESPIRATORY_RATE_MIN has 1 records (0.28%) with missing values. Column TEMPERATURE_MIN has 1 records (0.28%) with missing values. Column OXYGEN_SATURATION_MIN has 1 records (0.28%) with missing values. Column BLOODPRESSURE_DIASTOLIC_MAX has 1 records (0.28%) with missing values. Column BLOODPRESSURE_SISTOLIC_MAX has 1 records (0.28%) with missing values. Column HEART_RATE_MAX has 1 records (0.28%) with missing values. Column RESPIRATORY_RATE_MAX has 1 records (0.28%) with missing values. Column TEMPERATURE_MAX has 1 records (0.28%) with missing values. Column OXYGEN_SATURATION_MAX has 1 records (0.28%) with missing values. Column BLOODPRESSURE_DIASTOLIC_DIFF has 1 records (0.28%) with missing values. Column BLOODPRESSURE_SISTOLIC_DIFF has 1 records (0.28%) with missing values. Column HEART_RATE_DIFF has 1 records (0.28%) with missing values. Column RESPIRATORY_RATE_DIFF has 1 records (0.28%) with missing values. Column TEMPERATURE_DIFF has 1 records (0.28%) with missing values. Column OXYGEN_SATURATION_DIFF has 1 records (0.28%) with missing values. Column BLOODPRESSURE_DIASTOLIC_DIFF_REL has 1 records (0.28%) with missing values. Column BLOODPRESSURE_SISTOLIC_DIFF_REL has 1 records (0.28%) with missing values. Column HEART_RATE_DIFF_REL has 1 records (0.28%) with missing values. Column RESPIRATORY_RATE_DIFF_REL has 1 records (0.28%) with missing values. Column TEMPERATURE_DIFF_REL has 1 records (0.28%) with missing values. Column OXYGEN_SATURATION_DIFF_REL has 1 records (0.28%) with missing values. In total, there are 225 columns with missing values
Observation:
empty_data = data[data.isna().any(axis=1)]
display(empty_data)
| PATIENT_VISIT_IDENTIFIER | AGE_ABOVE65 | AGE_PERCENTIL | GENDER | DISEASE GROUPING 1 | DISEASE GROUPING 2 | DISEASE GROUPING 3 | DISEASE GROUPING 4 | DISEASE GROUPING 5 | DISEASE GROUPING 6 | HTN | IMMUNOCOMPROMISED | OTHER | ALBUMIN_MEDIAN | ALBUMIN_MEAN | ALBUMIN_MIN | ALBUMIN_MAX | ALBUMIN_DIFF | BE_ARTERIAL_MEDIAN | BE_ARTERIAL_MEAN | BE_ARTERIAL_MIN | BE_ARTERIAL_MAX | BE_ARTERIAL_DIFF | BE_VENOUS_MEDIAN | BE_VENOUS_MEAN | BE_VENOUS_MIN | BE_VENOUS_MAX | BE_VENOUS_DIFF | BIC_ARTERIAL_MEDIAN | BIC_ARTERIAL_MEAN | BIC_ARTERIAL_MIN | BIC_ARTERIAL_MAX | BIC_ARTERIAL_DIFF | BIC_VENOUS_MEDIAN | BIC_VENOUS_MEAN | BIC_VENOUS_MIN | BIC_VENOUS_MAX | BIC_VENOUS_DIFF | BILLIRUBIN_MEDIAN | BILLIRUBIN_MEAN | BILLIRUBIN_MIN | BILLIRUBIN_MAX | BILLIRUBIN_DIFF | BLAST_MEDIAN | BLAST_MEAN | BLAST_MIN | BLAST_MAX | BLAST_DIFF | CALCIUM_MEDIAN | CALCIUM_MEAN | CALCIUM_MIN | CALCIUM_MAX | CALCIUM_DIFF | CREATININ_MEDIAN | CREATININ_MEAN | CREATININ_MIN | CREATININ_MAX | CREATININ_DIFF | FFA_MEDIAN | FFA_MEAN | FFA_MIN | FFA_MAX | FFA_DIFF | GGT_MEDIAN | GGT_MEAN | GGT_MIN | GGT_MAX | GGT_DIFF | GLUCOSE_MEDIAN | GLUCOSE_MEAN | GLUCOSE_MIN | GLUCOSE_MAX | GLUCOSE_DIFF | HEMATOCRITE_MEDIAN | HEMATOCRITE_MEAN | HEMATOCRITE_MIN | HEMATOCRITE_MAX | HEMATOCRITE_DIFF | HEMOGLOBIN_MEDIAN | HEMOGLOBIN_MEAN | HEMOGLOBIN_MIN | HEMOGLOBIN_MAX | HEMOGLOBIN_DIFF | INR_MEDIAN | INR_MEAN | INR_MIN | INR_MAX | INR_DIFF | LACTATE_MEDIAN | LACTATE_MEAN | LACTATE_MIN | LACTATE_MAX | LACTATE_DIFF | LEUKOCYTES_MEDIAN | LEUKOCYTES_MEAN | LEUKOCYTES_MIN | LEUKOCYTES_MAX | LEUKOCYTES_DIFF | LINFOCITOS_MEDIAN | LINFOCITOS_MEAN | LINFOCITOS_MIN | LINFOCITOS_MAX | LINFOCITOS_DIFF | NEUTROPHILES_MEDIAN | NEUTROPHILES_MEAN | NEUTROPHILES_MIN | NEUTROPHILES_MAX | NEUTROPHILES_DIFF | P02_ARTERIAL_MEDIAN | P02_ARTERIAL_MEAN | P02_ARTERIAL_MIN | P02_ARTERIAL_MAX | P02_ARTERIAL_DIFF | P02_VENOUS_MEDIAN | P02_VENOUS_MEAN | P02_VENOUS_MIN | P02_VENOUS_MAX | P02_VENOUS_DIFF | PC02_ARTERIAL_MEDIAN | PC02_ARTERIAL_MEAN | PC02_ARTERIAL_MIN | PC02_ARTERIAL_MAX | PC02_ARTERIAL_DIFF | PC02_VENOUS_MEDIAN | PC02_VENOUS_MEAN | PC02_VENOUS_MIN | PC02_VENOUS_MAX | PC02_VENOUS_DIFF | PCR_MEDIAN | PCR_MEAN | PCR_MIN | PCR_MAX | PCR_DIFF | PH_ARTERIAL_MEDIAN | PH_ARTERIAL_MEAN | PH_ARTERIAL_MIN | PH_ARTERIAL_MAX | PH_ARTERIAL_DIFF | PH_VENOUS_MEDIAN | PH_VENOUS_MEAN | PH_VENOUS_MIN | PH_VENOUS_MAX | PH_VENOUS_DIFF | PLATELETS_MEDIAN | PLATELETS_MEAN | PLATELETS_MIN | PLATELETS_MAX | PLATELETS_DIFF | POTASSIUM_MEDIAN | POTASSIUM_MEAN | POTASSIUM_MIN | POTASSIUM_MAX | POTASSIUM_DIFF | SAT02_ARTERIAL_MEDIAN | SAT02_ARTERIAL_MEAN | SAT02_ARTERIAL_MIN | SAT02_ARTERIAL_MAX | SAT02_ARTERIAL_DIFF | SAT02_VENOUS_MEDIAN | SAT02_VENOUS_MEAN | SAT02_VENOUS_MIN | SAT02_VENOUS_MAX | SAT02_VENOUS_DIFF | SODIUM_MEDIAN | SODIUM_MEAN | SODIUM_MIN | SODIUM_MAX | SODIUM_DIFF | TGO_MEDIAN | TGO_MEAN | TGO_MIN | TGO_MAX | TGO_DIFF | TGP_MEDIAN | TGP_MEAN | TGP_MIN | TGP_MAX | TGP_DIFF | TTPA_MEDIAN | TTPA_MEAN | TTPA_MIN | TTPA_MAX | TTPA_DIFF | UREA_MEDIAN | UREA_MEAN | UREA_MIN | UREA_MAX | UREA_DIFF | DIMER_MEDIAN | DIMER_MEAN | DIMER_MIN | DIMER_MAX | DIMER_DIFF | BLOODPRESSURE_DIASTOLIC_MEAN | BLOODPRESSURE_SISTOLIC_MEAN | HEART_RATE_MEAN | RESPIRATORY_RATE_MEAN | TEMPERATURE_MEAN | OXYGEN_SATURATION_MEAN | BLOODPRESSURE_DIASTOLIC_MEDIAN | BLOODPRESSURE_SISTOLIC_MEDIAN | HEART_RATE_MEDIAN | RESPIRATORY_RATE_MEDIAN | TEMPERATURE_MEDIAN | OXYGEN_SATURATION_MEDIAN | BLOODPRESSURE_DIASTOLIC_MIN | BLOODPRESSURE_SISTOLIC_MIN | HEART_RATE_MIN | RESPIRATORY_RATE_MIN | TEMPERATURE_MIN | OXYGEN_SATURATION_MIN | BLOODPRESSURE_DIASTOLIC_MAX | BLOODPRESSURE_SISTOLIC_MAX | HEART_RATE_MAX | RESPIRATORY_RATE_MAX | TEMPERATURE_MAX | OXYGEN_SATURATION_MAX | BLOODPRESSURE_DIASTOLIC_DIFF | BLOODPRESSURE_SISTOLIC_DIFF | HEART_RATE_DIFF | RESPIRATORY_RATE_DIFF | TEMPERATURE_DIFF | OXYGEN_SATURATION_DIFF | BLOODPRESSURE_DIASTOLIC_DIFF_REL | BLOODPRESSURE_SISTOLIC_DIFF_REL | HEART_RATE_DIFF_REL | RESPIRATORY_RATE_DIFF_REL | TEMPERATURE_DIFF_REL | OXYGEN_SATURATION_DIFF_REL | GO_ICU | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 183 | 199 | 0 | 10th | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
| 262 | 287 | 0 | 10th | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -0.111111 | -0.630769 | 0.509434 | -0.254237 | 0.107143 | 0.842105 | -0.111111 | -0.630769 | 0.509434 | -0.241379 | 0.107143 | 0.842105 | 0.072165 | -0.325 | 0.555556 | -0.142857 | 0.450549 | 0.939394 | -0.384615 | -0.740541 | 0.19403 | -0.333333 | -0.101449 | 0.842105 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | 1 |
Observation:
There are two patients (199 and 287) who have not been filled up because some coulmns do not have any data in rows which belongs to that particular patient. The previous function needed at least one value in the column for each patient to fill the gaps. Because we are working on individual data of each person we will drop the data of these two patients.
# drop rows where at least one value is NaN
dataset = data.dropna(axis=0)
dataset = dataset.reset_index(drop=True)
check_for_missing_data_in_columns(dataset)
NaN values = 0 In total, there are 0 columns with missing values
dataset.shape
(351, 230)
After removing those two patients, data is now free from any missing values. The final shape of the dataset is:
# check for duplicate columns i.e columns having all values exactly equal
duplicated = dataset.T.duplicated()
count = 0
# The first instance of a column is marked as 'False', when the value is seen to be repeated in other
# column(s) it is marked as 'True'
for i in range(len(duplicated)):
if duplicated[i] == True:
print(duplicated.index[i], ":", duplicated[i])
count += 1
# 143 duplicate columns
print("total duplicate columns = ", count)
ALBUMIN_MEAN : True ALBUMIN_MIN : True ALBUMIN_MAX : True BE_ARTERIAL_MEAN : True BE_ARTERIAL_MIN : True BE_ARTERIAL_MAX : True BE_ARTERIAL_DIFF : True BE_VENOUS_MEAN : True BE_VENOUS_MIN : True BE_VENOUS_MAX : True BE_VENOUS_DIFF : True BIC_ARTERIAL_MEAN : True BIC_ARTERIAL_MIN : True BIC_ARTERIAL_MAX : True BIC_ARTERIAL_DIFF : True BIC_VENOUS_MEAN : True BIC_VENOUS_MIN : True BIC_VENOUS_MAX : True BIC_VENOUS_DIFF : True BILLIRUBIN_MEAN : True BILLIRUBIN_MIN : True BILLIRUBIN_MAX : True BILLIRUBIN_DIFF : True BLAST_MEAN : True BLAST_MIN : True BLAST_MAX : True BLAST_DIFF : True CALCIUM_MEAN : True CALCIUM_MIN : True CALCIUM_MAX : True CALCIUM_DIFF : True CREATININ_MEAN : True CREATININ_MIN : True CREATININ_MAX : True CREATININ_DIFF : True FFA_MEAN : True FFA_MIN : True FFA_MAX : True FFA_DIFF : True GGT_MEAN : True GGT_MIN : True GGT_MAX : True GGT_DIFF : True GLUCOSE_MEAN : True GLUCOSE_MIN : True GLUCOSE_MAX : True GLUCOSE_DIFF : True HEMATOCRITE_MEAN : True HEMATOCRITE_MIN : True HEMATOCRITE_MAX : True HEMATOCRITE_DIFF : True HEMOGLOBIN_MEAN : True HEMOGLOBIN_MIN : True HEMOGLOBIN_MAX : True HEMOGLOBIN_DIFF : True INR_MEAN : True INR_MIN : True INR_MAX : True INR_DIFF : True LACTATE_MEAN : True LACTATE_MIN : True LACTATE_MAX : True LACTATE_DIFF : True LEUKOCYTES_MEAN : True LEUKOCYTES_MIN : True LEUKOCYTES_MAX : True LEUKOCYTES_DIFF : True LINFOCITOS_MEAN : True LINFOCITOS_MIN : True LINFOCITOS_MAX : True LINFOCITOS_DIFF : True NEUTROPHILES_MEAN : True NEUTROPHILES_MIN : True NEUTROPHILES_MAX : True NEUTROPHILES_DIFF : True P02_ARTERIAL_MEAN : True P02_ARTERIAL_MIN : True P02_ARTERIAL_MAX : True P02_ARTERIAL_DIFF : True P02_VENOUS_MEAN : True P02_VENOUS_MIN : True P02_VENOUS_MAX : True P02_VENOUS_DIFF : True PC02_ARTERIAL_MEAN : True PC02_ARTERIAL_MIN : True PC02_ARTERIAL_MAX : True PC02_ARTERIAL_DIFF : True PC02_VENOUS_MEAN : True PC02_VENOUS_MIN : True PC02_VENOUS_MAX : True PC02_VENOUS_DIFF : True PCR_MEAN : True PCR_MIN : True PCR_MAX : True PCR_DIFF : True PH_ARTERIAL_MEAN : True PH_ARTERIAL_MIN : True PH_ARTERIAL_MAX : True PH_ARTERIAL_DIFF : True PH_VENOUS_MEAN : True PH_VENOUS_MIN : True PH_VENOUS_MAX : True PH_VENOUS_DIFF : True PLATELETS_MEAN : True PLATELETS_MIN : True PLATELETS_MAX : True PLATELETS_DIFF : True POTASSIUM_MEAN : True POTASSIUM_MIN : True POTASSIUM_MAX : True POTASSIUM_DIFF : True SAT02_ARTERIAL_MEAN : True SAT02_ARTERIAL_MIN : True SAT02_ARTERIAL_MAX : True SAT02_ARTERIAL_DIFF : True SAT02_VENOUS_MEAN : True SAT02_VENOUS_MIN : True SAT02_VENOUS_MAX : True SAT02_VENOUS_DIFF : True SODIUM_MEAN : True SODIUM_MIN : True SODIUM_MAX : True SODIUM_DIFF : True TGO_MEAN : True TGO_MIN : True TGO_MAX : True TGO_DIFF : True TGP_MEAN : True TGP_MIN : True TGP_MAX : True TGP_DIFF : True TTPA_MEAN : True TTPA_MIN : True TTPA_MAX : True TTPA_DIFF : True UREA_MEAN : True UREA_MIN : True UREA_MAX : True UREA_DIFF : True DIMER_MEAN : True DIMER_MIN : True DIMER_MAX : True DIMER_DIFF : True total duplicate columns = 143
dataset = dataset.T.drop_duplicates().T
# Drop columns with constant value
col = dataset.columns[dataset.nunique() <= 1].tolist()
print(col)
dataset.drop(columns = col, inplace=True)
['ALBUMIN_DIFF']
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 351 entries, 0 to 350 Data columns (total 86 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PATIENT_VISIT_IDENTIFIER 351 non-null object 1 AGE_ABOVE65 351 non-null object 2 AGE_PERCENTIL 351 non-null object 3 GENDER 351 non-null object 4 DISEASE GROUPING 1 351 non-null object 5 DISEASE GROUPING 2 351 non-null object 6 DISEASE GROUPING 3 351 non-null object 7 DISEASE GROUPING 4 351 non-null object 8 DISEASE GROUPING 5 351 non-null object 9 DISEASE GROUPING 6 351 non-null object 10 HTN 351 non-null object 11 IMMUNOCOMPROMISED 351 non-null object 12 OTHER 351 non-null object 13 ALBUMIN_MEDIAN 351 non-null object 14 BE_ARTERIAL_MEDIAN 351 non-null object 15 BE_VENOUS_MEDIAN 351 non-null object 16 BIC_ARTERIAL_MEDIAN 351 non-null object 17 BIC_VENOUS_MEDIAN 351 non-null object 18 BILLIRUBIN_MEDIAN 351 non-null object 19 BLAST_MEDIAN 351 non-null object 20 CALCIUM_MEDIAN 351 non-null object 21 CREATININ_MEDIAN 351 non-null object 22 FFA_MEDIAN 351 non-null object 23 GGT_MEDIAN 351 non-null object 24 GLUCOSE_MEDIAN 351 non-null object 25 HEMATOCRITE_MEDIAN 351 non-null object 26 HEMOGLOBIN_MEDIAN 351 non-null object 27 INR_MEDIAN 351 non-null object 28 LACTATE_MEDIAN 351 non-null object 29 LEUKOCYTES_MEDIAN 351 non-null object 30 LINFOCITOS_MEDIAN 351 non-null object 31 NEUTROPHILES_MEDIAN 351 non-null object 32 P02_ARTERIAL_MEDIAN 351 non-null object 33 P02_VENOUS_MEDIAN 351 non-null object 34 PC02_ARTERIAL_MEDIAN 351 non-null object 35 PC02_VENOUS_MEDIAN 351 non-null object 36 PCR_MEDIAN 351 non-null object 37 PH_ARTERIAL_MEDIAN 351 non-null object 38 PH_VENOUS_MEDIAN 351 non-null object 39 PLATELETS_MEDIAN 351 non-null object 40 POTASSIUM_MEDIAN 351 non-null object 41 SAT02_ARTERIAL_MEDIAN 351 non-null object 42 SAT02_VENOUS_MEDIAN 351 non-null object 43 SODIUM_MEDIAN 351 non-null object 44 TGO_MEDIAN 351 non-null object 45 TGP_MEDIAN 351 non-null object 46 TTPA_MEDIAN 351 non-null object 47 UREA_MEDIAN 351 non-null object 48 DIMER_MEDIAN 351 non-null object 49 BLOODPRESSURE_DIASTOLIC_MEAN 351 non-null object 50 BLOODPRESSURE_SISTOLIC_MEAN 351 non-null object 51 HEART_RATE_MEAN 351 non-null object 52 RESPIRATORY_RATE_MEAN 351 non-null object 53 TEMPERATURE_MEAN 351 non-null object 54 OXYGEN_SATURATION_MEAN 351 non-null object 55 BLOODPRESSURE_DIASTOLIC_MEDIAN 351 non-null object 56 BLOODPRESSURE_SISTOLIC_MEDIAN 351 non-null object 57 HEART_RATE_MEDIAN 351 non-null object 58 RESPIRATORY_RATE_MEDIAN 351 non-null object 59 TEMPERATURE_MEDIAN 351 non-null object 60 OXYGEN_SATURATION_MEDIAN 351 non-null object 61 BLOODPRESSURE_DIASTOLIC_MIN 351 non-null object 62 BLOODPRESSURE_SISTOLIC_MIN 351 non-null object 63 HEART_RATE_MIN 351 non-null object 64 RESPIRATORY_RATE_MIN 351 non-null object 65 TEMPERATURE_MIN 351 non-null object 66 OXYGEN_SATURATION_MIN 351 non-null object 67 BLOODPRESSURE_DIASTOLIC_MAX 351 non-null object 68 BLOODPRESSURE_SISTOLIC_MAX 351 non-null object 69 HEART_RATE_MAX 351 non-null object 70 RESPIRATORY_RATE_MAX 351 non-null object 71 TEMPERATURE_MAX 351 non-null object 72 OXYGEN_SATURATION_MAX 351 non-null object 73 BLOODPRESSURE_DIASTOLIC_DIFF 351 non-null object 74 BLOODPRESSURE_SISTOLIC_DIFF 351 non-null object 75 HEART_RATE_DIFF 351 non-null object 76 RESPIRATORY_RATE_DIFF 351 non-null object 77 TEMPERATURE_DIFF 351 non-null object 78 OXYGEN_SATURATION_DIFF 351 non-null object 79 BLOODPRESSURE_DIASTOLIC_DIFF_REL 351 non-null object 80 BLOODPRESSURE_SISTOLIC_DIFF_REL 351 non-null object 81 HEART_RATE_DIFF_REL 351 non-null object 82 RESPIRATORY_RATE_DIFF_REL 351 non-null object 83 TEMPERATURE_DIFF_REL 351 non-null object 84 OXYGEN_SATURATION_DIFF_REL 351 non-null object 85 GO_ICU 351 non-null object dtypes: object(86) memory usage: 236.0+ KB
dataset.shape
(351, 86)
Observation:
for column in dataset.columns:
try:
dataset[column] = dataset[column].astype("float64")
except ValueError:
pass
dataset['AGE_ABOVE65'] = dataset['AGE_ABOVE65'].astype('int64')
dataset['GENDER'] = dataset['GENDER'].astype('int64')
dataset['PATIENT_VISIT_IDENTIFIER'] = dataset['PATIENT_VISIT_IDENTIFIER'].astype('int64')
dataset['GO_ICU'] = dataset['GO_ICU'].astype('int64')
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 351 entries, 0 to 350 Data columns (total 86 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PATIENT_VISIT_IDENTIFIER 351 non-null int64 1 AGE_ABOVE65 351 non-null int64 2 AGE_PERCENTIL 351 non-null object 3 GENDER 351 non-null int64 4 DISEASE GROUPING 1 351 non-null float64 5 DISEASE GROUPING 2 351 non-null float64 6 DISEASE GROUPING 3 351 non-null float64 7 DISEASE GROUPING 4 351 non-null float64 8 DISEASE GROUPING 5 351 non-null float64 9 DISEASE GROUPING 6 351 non-null float64 10 HTN 351 non-null float64 11 IMMUNOCOMPROMISED 351 non-null float64 12 OTHER 351 non-null float64 13 ALBUMIN_MEDIAN 351 non-null float64 14 BE_ARTERIAL_MEDIAN 351 non-null float64 15 BE_VENOUS_MEDIAN 351 non-null float64 16 BIC_ARTERIAL_MEDIAN 351 non-null float64 17 BIC_VENOUS_MEDIAN 351 non-null float64 18 BILLIRUBIN_MEDIAN 351 non-null float64 19 BLAST_MEDIAN 351 non-null float64 20 CALCIUM_MEDIAN 351 non-null float64 21 CREATININ_MEDIAN 351 non-null float64 22 FFA_MEDIAN 351 non-null float64 23 GGT_MEDIAN 351 non-null float64 24 GLUCOSE_MEDIAN 351 non-null float64 25 HEMATOCRITE_MEDIAN 351 non-null float64 26 HEMOGLOBIN_MEDIAN 351 non-null float64 27 INR_MEDIAN 351 non-null float64 28 LACTATE_MEDIAN 351 non-null float64 29 LEUKOCYTES_MEDIAN 351 non-null float64 30 LINFOCITOS_MEDIAN 351 non-null float64 31 NEUTROPHILES_MEDIAN 351 non-null float64 32 P02_ARTERIAL_MEDIAN 351 non-null float64 33 P02_VENOUS_MEDIAN 351 non-null float64 34 PC02_ARTERIAL_MEDIAN 351 non-null float64 35 PC02_VENOUS_MEDIAN 351 non-null float64 36 PCR_MEDIAN 351 non-null float64 37 PH_ARTERIAL_MEDIAN 351 non-null float64 38 PH_VENOUS_MEDIAN 351 non-null float64 39 PLATELETS_MEDIAN 351 non-null float64 40 POTASSIUM_MEDIAN 351 non-null float64 41 SAT02_ARTERIAL_MEDIAN 351 non-null float64 42 SAT02_VENOUS_MEDIAN 351 non-null float64 43 SODIUM_MEDIAN 351 non-null float64 44 TGO_MEDIAN 351 non-null float64 45 TGP_MEDIAN 351 non-null float64 46 TTPA_MEDIAN 351 non-null float64 47 UREA_MEDIAN 351 non-null float64 48 DIMER_MEDIAN 351 non-null float64 49 BLOODPRESSURE_DIASTOLIC_MEAN 351 non-null float64 50 BLOODPRESSURE_SISTOLIC_MEAN 351 non-null float64 51 HEART_RATE_MEAN 351 non-null float64 52 RESPIRATORY_RATE_MEAN 351 non-null float64 53 TEMPERATURE_MEAN 351 non-null float64 54 OXYGEN_SATURATION_MEAN 351 non-null float64 55 BLOODPRESSURE_DIASTOLIC_MEDIAN 351 non-null float64 56 BLOODPRESSURE_SISTOLIC_MEDIAN 351 non-null float64 57 HEART_RATE_MEDIAN 351 non-null float64 58 RESPIRATORY_RATE_MEDIAN 351 non-null float64 59 TEMPERATURE_MEDIAN 351 non-null float64 60 OXYGEN_SATURATION_MEDIAN 351 non-null float64 61 BLOODPRESSURE_DIASTOLIC_MIN 351 non-null float64 62 BLOODPRESSURE_SISTOLIC_MIN 351 non-null float64 63 HEART_RATE_MIN 351 non-null float64 64 RESPIRATORY_RATE_MIN 351 non-null float64 65 TEMPERATURE_MIN 351 non-null float64 66 OXYGEN_SATURATION_MIN 351 non-null float64 67 BLOODPRESSURE_DIASTOLIC_MAX 351 non-null float64 68 BLOODPRESSURE_SISTOLIC_MAX 351 non-null float64 69 HEART_RATE_MAX 351 non-null float64 70 RESPIRATORY_RATE_MAX 351 non-null float64 71 TEMPERATURE_MAX 351 non-null float64 72 OXYGEN_SATURATION_MAX 351 non-null float64 73 BLOODPRESSURE_DIASTOLIC_DIFF 351 non-null float64 74 BLOODPRESSURE_SISTOLIC_DIFF 351 non-null float64 75 HEART_RATE_DIFF 351 non-null float64 76 RESPIRATORY_RATE_DIFF 351 non-null float64 77 TEMPERATURE_DIFF 351 non-null float64 78 OXYGEN_SATURATION_DIFF 351 non-null float64 79 BLOODPRESSURE_DIASTOLIC_DIFF_REL 351 non-null float64 80 BLOODPRESSURE_SISTOLIC_DIFF_REL 351 non-null float64 81 HEART_RATE_DIFF_REL 351 non-null float64 82 RESPIRATORY_RATE_DIFF_REL 351 non-null float64 83 TEMPERATURE_DIFF_REL 351 non-null float64 84 OXYGEN_SATURATION_DIFF_REL 351 non-null float64 85 GO_ICU 351 non-null int64 dtypes: float64(81), int64(4), object(1) memory usage: 236.0+ KB
for num in range (1, 10):
dataset["AGE_PERCENTIL"] = dataset["AGE_PERCENTIL"].replace([f'{num}0th'], f'{num}0')
dataset["AGE_PERCENTIL"] = dataset["AGE_PERCENTIL"].replace(['Above 90th'], '100')
IMPORTANT: In order to differentiate values "90th" and "above 90th", that second value will be represented as "100"
dataset["AGE_PERCENTIL"] = dataset["AGE_PERCENTIL"].astype(int)
dataset["AGE_PERCENTIL"].info()
<class 'pandas.core.series.Series'> RangeIndex: 351 entries, 0 to 350 Series name: AGE_PERCENTIL Non-Null Count Dtype -------------- ----- 351 non-null int64 dtypes: int64(1) memory usage: 2.9 KB
metadata = []
for feature in dataset.columns:
# Defining the role
if feature == 'GO_ICU':
use = 'target'
elif feature == 'PATIENT_VISIT_IDENTIFIER':
use = 'patient id'
else:
use = 'input'
# Defining the type
if 'GROUPING' in feature or 'GENDER' in feature or 'AGE_ABOVE65' in feature or feature == 'GO_ICU':
type = 'binary'
elif 'AGE_PERCENTIL' in feature or feature == 'PATIENT_VISIT_IDENTIFIER':
type = 'categorical'
elif dataset[feature].dtype == float or isinstance(dataset[feature].dtype, float):
type = 'real'
else:
type = 'integer'
# Defining the data type
dtype = dataset[feature].dtype
category = 'none'
# Defining the category
if 'DIFF' in feature:
category = 'vital_signs'
elif 'MAX' in feature:
category = 'vital_signs'
elif 'MIN' in feature:
category = 'vital_signs'
elif 'MEDIAN' in feature:
category = 'vital_signs'
elif 'MEAN' in feature:
category = 'vital_signs'
elif 'GROUPING' in feature:
category = 'disease_group'
# Creating a Dict that contains all the metadata for the variable
feature_dictionary = {
'varname': feature,
'use': use,
'type': type,
'dtype': dtype,
'category' : category
}
metadata.append(feature_dictionary)
meta = pd.DataFrame(metadata, columns=['varname', 'use', 'type', 'dtype', 'category'])
meta.set_index('varname', inplace=True)
meta
| use | type | dtype | category | |
|---|---|---|---|---|
| varname | ||||
| PATIENT_VISIT_IDENTIFIER | patient id | categorical | int64 | none |
| AGE_ABOVE65 | input | binary | int64 | none |
| AGE_PERCENTIL | input | categorical | int64 | none |
| GENDER | input | binary | int64 | none |
| DISEASE GROUPING 1 | input | binary | float64 | disease_group |
| DISEASE GROUPING 2 | input | binary | float64 | disease_group |
| DISEASE GROUPING 3 | input | binary | float64 | disease_group |
| DISEASE GROUPING 4 | input | binary | float64 | disease_group |
| DISEASE GROUPING 5 | input | binary | float64 | disease_group |
| DISEASE GROUPING 6 | input | binary | float64 | disease_group |
| HTN | input | real | float64 | none |
| IMMUNOCOMPROMISED | input | real | float64 | none |
| OTHER | input | real | float64 | none |
| ALBUMIN_MEDIAN | input | real | float64 | vital_signs |
| BE_ARTERIAL_MEDIAN | input | real | float64 | vital_signs |
| BE_VENOUS_MEDIAN | input | real | float64 | vital_signs |
| BIC_ARTERIAL_MEDIAN | input | real | float64 | vital_signs |
| BIC_VENOUS_MEDIAN | input | real | float64 | vital_signs |
| BILLIRUBIN_MEDIAN | input | real | float64 | vital_signs |
| BLAST_MEDIAN | input | real | float64 | vital_signs |
| CALCIUM_MEDIAN | input | real | float64 | vital_signs |
| CREATININ_MEDIAN | input | real | float64 | vital_signs |
| FFA_MEDIAN | input | real | float64 | vital_signs |
| GGT_MEDIAN | input | real | float64 | vital_signs |
| GLUCOSE_MEDIAN | input | real | float64 | vital_signs |
| HEMATOCRITE_MEDIAN | input | real | float64 | vital_signs |
| HEMOGLOBIN_MEDIAN | input | real | float64 | vital_signs |
| INR_MEDIAN | input | real | float64 | vital_signs |
| LACTATE_MEDIAN | input | real | float64 | vital_signs |
| LEUKOCYTES_MEDIAN | input | real | float64 | vital_signs |
| LINFOCITOS_MEDIAN | input | real | float64 | vital_signs |
| NEUTROPHILES_MEDIAN | input | real | float64 | vital_signs |
| P02_ARTERIAL_MEDIAN | input | real | float64 | vital_signs |
| P02_VENOUS_MEDIAN | input | real | float64 | vital_signs |
| PC02_ARTERIAL_MEDIAN | input | real | float64 | vital_signs |
| PC02_VENOUS_MEDIAN | input | real | float64 | vital_signs |
| PCR_MEDIAN | input | real | float64 | vital_signs |
| PH_ARTERIAL_MEDIAN | input | real | float64 | vital_signs |
| PH_VENOUS_MEDIAN | input | real | float64 | vital_signs |
| PLATELETS_MEDIAN | input | real | float64 | vital_signs |
| POTASSIUM_MEDIAN | input | real | float64 | vital_signs |
| SAT02_ARTERIAL_MEDIAN | input | real | float64 | vital_signs |
| SAT02_VENOUS_MEDIAN | input | real | float64 | vital_signs |
| SODIUM_MEDIAN | input | real | float64 | vital_signs |
| TGO_MEDIAN | input | real | float64 | vital_signs |
| TGP_MEDIAN | input | real | float64 | vital_signs |
| TTPA_MEDIAN | input | real | float64 | vital_signs |
| UREA_MEDIAN | input | real | float64 | vital_signs |
| DIMER_MEDIAN | input | real | float64 | vital_signs |
| BLOODPRESSURE_DIASTOLIC_MEAN | input | real | float64 | vital_signs |
| BLOODPRESSURE_SISTOLIC_MEAN | input | real | float64 | vital_signs |
| HEART_RATE_MEAN | input | real | float64 | vital_signs |
| RESPIRATORY_RATE_MEAN | input | real | float64 | vital_signs |
| TEMPERATURE_MEAN | input | real | float64 | vital_signs |
| OXYGEN_SATURATION_MEAN | input | real | float64 | vital_signs |
| BLOODPRESSURE_DIASTOLIC_MEDIAN | input | real | float64 | vital_signs |
| BLOODPRESSURE_SISTOLIC_MEDIAN | input | real | float64 | vital_signs |
| HEART_RATE_MEDIAN | input | real | float64 | vital_signs |
| RESPIRATORY_RATE_MEDIAN | input | real | float64 | vital_signs |
| TEMPERATURE_MEDIAN | input | real | float64 | vital_signs |
| OXYGEN_SATURATION_MEDIAN | input | real | float64 | vital_signs |
| BLOODPRESSURE_DIASTOLIC_MIN | input | real | float64 | vital_signs |
| BLOODPRESSURE_SISTOLIC_MIN | input | real | float64 | vital_signs |
| HEART_RATE_MIN | input | real | float64 | vital_signs |
| RESPIRATORY_RATE_MIN | input | real | float64 | vital_signs |
| TEMPERATURE_MIN | input | real | float64 | vital_signs |
| OXYGEN_SATURATION_MIN | input | real | float64 | vital_signs |
| BLOODPRESSURE_DIASTOLIC_MAX | input | real | float64 | vital_signs |
| BLOODPRESSURE_SISTOLIC_MAX | input | real | float64 | vital_signs |
| HEART_RATE_MAX | input | real | float64 | vital_signs |
| RESPIRATORY_RATE_MAX | input | real | float64 | vital_signs |
| TEMPERATURE_MAX | input | real | float64 | vital_signs |
| OXYGEN_SATURATION_MAX | input | real | float64 | vital_signs |
| BLOODPRESSURE_DIASTOLIC_DIFF | input | real | float64 | vital_signs |
| BLOODPRESSURE_SISTOLIC_DIFF | input | real | float64 | vital_signs |
| HEART_RATE_DIFF | input | real | float64 | vital_signs |
| RESPIRATORY_RATE_DIFF | input | real | float64 | vital_signs |
| TEMPERATURE_DIFF | input | real | float64 | vital_signs |
| OXYGEN_SATURATION_DIFF | input | real | float64 | vital_signs |
| BLOODPRESSURE_DIASTOLIC_DIFF_REL | input | real | float64 | vital_signs |
| BLOODPRESSURE_SISTOLIC_DIFF_REL | input | real | float64 | vital_signs |
| HEART_RATE_DIFF_REL | input | real | float64 | vital_signs |
| RESPIRATORY_RATE_DIFF_REL | input | real | float64 | vital_signs |
| TEMPERATURE_DIFF_REL | input | real | float64 | vital_signs |
| OXYGEN_SATURATION_DIFF_REL | input | real | float64 | vital_signs |
| GO_ICU | target | binary | int64 | none |
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 351 entries, 0 to 350 Data columns (total 86 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PATIENT_VISIT_IDENTIFIER 351 non-null int64 1 AGE_ABOVE65 351 non-null int64 2 AGE_PERCENTIL 351 non-null int64 3 GENDER 351 non-null int64 4 DISEASE GROUPING 1 351 non-null float64 5 DISEASE GROUPING 2 351 non-null float64 6 DISEASE GROUPING 3 351 non-null float64 7 DISEASE GROUPING 4 351 non-null float64 8 DISEASE GROUPING 5 351 non-null float64 9 DISEASE GROUPING 6 351 non-null float64 10 HTN 351 non-null float64 11 IMMUNOCOMPROMISED 351 non-null float64 12 OTHER 351 non-null float64 13 ALBUMIN_MEDIAN 351 non-null float64 14 BE_ARTERIAL_MEDIAN 351 non-null float64 15 BE_VENOUS_MEDIAN 351 non-null float64 16 BIC_ARTERIAL_MEDIAN 351 non-null float64 17 BIC_VENOUS_MEDIAN 351 non-null float64 18 BILLIRUBIN_MEDIAN 351 non-null float64 19 BLAST_MEDIAN 351 non-null float64 20 CALCIUM_MEDIAN 351 non-null float64 21 CREATININ_MEDIAN 351 non-null float64 22 FFA_MEDIAN 351 non-null float64 23 GGT_MEDIAN 351 non-null float64 24 GLUCOSE_MEDIAN 351 non-null float64 25 HEMATOCRITE_MEDIAN 351 non-null float64 26 HEMOGLOBIN_MEDIAN 351 non-null float64 27 INR_MEDIAN 351 non-null float64 28 LACTATE_MEDIAN 351 non-null float64 29 LEUKOCYTES_MEDIAN 351 non-null float64 30 LINFOCITOS_MEDIAN 351 non-null float64 31 NEUTROPHILES_MEDIAN 351 non-null float64 32 P02_ARTERIAL_MEDIAN 351 non-null float64 33 P02_VENOUS_MEDIAN 351 non-null float64 34 PC02_ARTERIAL_MEDIAN 351 non-null float64 35 PC02_VENOUS_MEDIAN 351 non-null float64 36 PCR_MEDIAN 351 non-null float64 37 PH_ARTERIAL_MEDIAN 351 non-null float64 38 PH_VENOUS_MEDIAN 351 non-null float64 39 PLATELETS_MEDIAN 351 non-null float64 40 POTASSIUM_MEDIAN 351 non-null float64 41 SAT02_ARTERIAL_MEDIAN 351 non-null float64 42 SAT02_VENOUS_MEDIAN 351 non-null float64 43 SODIUM_MEDIAN 351 non-null float64 44 TGO_MEDIAN 351 non-null float64 45 TGP_MEDIAN 351 non-null float64 46 TTPA_MEDIAN 351 non-null float64 47 UREA_MEDIAN 351 non-null float64 48 DIMER_MEDIAN 351 non-null float64 49 BLOODPRESSURE_DIASTOLIC_MEAN 351 non-null float64 50 BLOODPRESSURE_SISTOLIC_MEAN 351 non-null float64 51 HEART_RATE_MEAN 351 non-null float64 52 RESPIRATORY_RATE_MEAN 351 non-null float64 53 TEMPERATURE_MEAN 351 non-null float64 54 OXYGEN_SATURATION_MEAN 351 non-null float64 55 BLOODPRESSURE_DIASTOLIC_MEDIAN 351 non-null float64 56 BLOODPRESSURE_SISTOLIC_MEDIAN 351 non-null float64 57 HEART_RATE_MEDIAN 351 non-null float64 58 RESPIRATORY_RATE_MEDIAN 351 non-null float64 59 TEMPERATURE_MEDIAN 351 non-null float64 60 OXYGEN_SATURATION_MEDIAN 351 non-null float64 61 BLOODPRESSURE_DIASTOLIC_MIN 351 non-null float64 62 BLOODPRESSURE_SISTOLIC_MIN 351 non-null float64 63 HEART_RATE_MIN 351 non-null float64 64 RESPIRATORY_RATE_MIN 351 non-null float64 65 TEMPERATURE_MIN 351 non-null float64 66 OXYGEN_SATURATION_MIN 351 non-null float64 67 BLOODPRESSURE_DIASTOLIC_MAX 351 non-null float64 68 BLOODPRESSURE_SISTOLIC_MAX 351 non-null float64 69 HEART_RATE_MAX 351 non-null float64 70 RESPIRATORY_RATE_MAX 351 non-null float64 71 TEMPERATURE_MAX 351 non-null float64 72 OXYGEN_SATURATION_MAX 351 non-null float64 73 BLOODPRESSURE_DIASTOLIC_DIFF 351 non-null float64 74 BLOODPRESSURE_SISTOLIC_DIFF 351 non-null float64 75 HEART_RATE_DIFF 351 non-null float64 76 RESPIRATORY_RATE_DIFF 351 non-null float64 77 TEMPERATURE_DIFF 351 non-null float64 78 OXYGEN_SATURATION_DIFF 351 non-null float64 79 BLOODPRESSURE_DIASTOLIC_DIFF_REL 351 non-null float64 80 BLOODPRESSURE_SISTOLIC_DIFF_REL 351 non-null float64 81 HEART_RATE_DIFF_REL 351 non-null float64 82 RESPIRATORY_RATE_DIFF_REL 351 non-null float64 83 TEMPERATURE_DIFF_REL 351 non-null float64 84 OXYGEN_SATURATION_DIFF_REL 351 non-null float64 85 GO_ICU 351 non-null int64 dtypes: float64(81), int64(5) memory usage: 236.0 KB
ICU_prop_main = dataset.groupby('GO_ICU')['PATIENT_VISIT_IDENTIFIER'].count().reset_index()
labels = ["not-admitted", "admitted"]
plt.title('ICU admissions proportion', fontdict= {'fontsize' : 16}, pad=100)
plt.pie(ICU_prop_main['PATIENT_VISIT_IDENTIFIER'],textprops={'fontsize': 12},radius =2, labels = labels, startangle=90, autopct=lambda p : '{:.2f}% ({:,.0f}patients)'.format(p,p * sum(ICU_prop_main['PATIENT_VISIT_IDENTIFIER'])/100))
plt.show()
#Age distribution according to AGE_PERCENTILE
AGE_prop_percentil = dataset.groupby('AGE_PERCENTIL')['PATIENT_VISIT_IDENTIFIER'].count().reset_index()
AGE_prop_percentil.head()
plt.figure(figsize=(9,6))
plt.bar(AGE_prop_percentil["AGE_PERCENTIL"],AGE_prop_percentil["PATIENT_VISIT_IDENTIFIER"], width = 8)
plt.xticks(AGE_prop_percentil["AGE_PERCENTIL"])
plt.ylabel("Patients Count")
plt.xlabel("Percentile")
plt.title('Age distribution according to AGE_PERCENTILE', fontdict= {'fontsize' : 14})
plt.show()
def create_bar_to_icu(feature_name):
ax = sns.countplot(x=feature_name, hue="GO_ICU", data=dataset)
for i in ax.containers:
ax.bar_label(i,)
create_bar_to_icu("AGE_ABOVE65")
AGE_65_ICU = dataset[dataset['GO_ICU'] == 1]
AGE_65_ICU = AGE_65_ICU.groupby('AGE_ABOVE65')['PATIENT_VISIT_IDENTIFIER'].count().reset_index()
labels = ["Below-65", "Above-65"]
plt.title('ICU admissions proportion for ages below/above 65', fontdict= {'fontsize' : 16}, pad=100)
plt.pie(AGE_65_ICU['PATIENT_VISIT_IDENTIFIER'],textprops={'fontsize': 12},radius =2, labels = labels, startangle=90, autopct=lambda p : '{:.2f}% ({:,.0f}patients)'.format(p,p * sum(AGE_65_ICU['PATIENT_VISIT_IDENTIFIER'])/100))
plt.show()
AGE_prop_percentil = dataset[dataset['GO_ICU'] == 1]
AGE_prop_percentil = AGE_prop_percentil.groupby('AGE_PERCENTIL')['PATIENT_VISIT_IDENTIFIER'].count().reset_index()
AGE_prop_percentil.head()
plt.figure(figsize=(9,6))
plt.bar(AGE_prop_percentil["AGE_PERCENTIL"],AGE_prop_percentil["PATIENT_VISIT_IDENTIFIER"], width = 8)
plt.xticks(AGE_prop_percentil["AGE_PERCENTIL"])
plt.ylabel("Patients Count")
plt.xlabel("Age Percentile")
plt.title('ICU admissions proportion according to AGE_PERCENTILE', fontdict= {'fontsize' : 14})
plt.show()
Observations:
def corr_heatmap(sample, masking=False):
sns.set_style('whitegrid')
# Create color map ranging between two colors
cmap = sns.diverging_palette(50, 10, as_cmap=True)
fig, ax = plt.subplots(figsize=(10,10))
if masking==False:
correlations = sample.corr()
sns.heatmap(correlations, cmap=cmap, vmax=1.0, center=0, fmt='.2f',
square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .75})
else:
correlations = np.triu(sample.corr())
sns.heatmap(sample.corr(), cmap=cmap, vmax=1.0, center=0, fmt='.2f',
square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .75},
mask=correlations)
plt.show();
sample = dataset
var = ['AGE_PERCENTIL', 'AGE_ABOVE65', 'GO_ICU']
sample = sample[var]
corr_heatmap(sample)
Observation:
"AGE_PERCENTIL" is going to be used for future modeling since it has higher correlation (0.36) with the target, than "AGE_ABOVE65" (0.29)
dataset.drop(columns = 'AGE_ABOVE65', inplace=True)
create_bar_to_icu("GENDER")
Observation:
create_bar_to_icu("HTN")
create_bar_to_icu("IMMUNOCOMPROMISED")
create_bar_to_icu("OTHER")
Observations:
All 3 features are not going to be helpful with modeling due to high data imbalance
col = ["OTHER", "IMMUNOCOMPROMISED", "HTN"]
dataset.drop(columns = col, inplace=True)
disease_groups = meta[meta.category == 'disease_group'].index
fig, axes = plt.subplots(ncols=2,nrows=3,figsize=(15,10))
col = 0
for i in range(len(disease_groups)):
x = sns.countplot(x=disease_groups[i], hue="GO_ICU", data=dataset, ax = axes[int((i)/2)][col])
col = (col+1) % 2
Observations:
dataset.drop(columns = disease_groups, inplace=True)
vital_signs = meta[meta.category == 'vital_signs'].index
vital_signs_ICU = vital_signs.append(meta[meta.use == 'target'].index)
vital_signs_df = dataset[vital_signs_ICU]
f,ax = plt.subplots(figsize=(23,23))
sns.heatmap(vital_signs_df.drop(columns=['GO_ICU']).corr())
plt.show()
rows_list = []
for f in vital_signs:
v1 = vital_signs_df[ vital_signs_df['GO_ICU']==1 ][f].var()
v0 = vital_signs_df[ vital_signs_df['GO_ICU']==0 ][f].var()
dict1 = {
'feature' : f,
'icu' : v1,
'non-icu' : v0
}
rows_list.append(dict1)
var_vital_signs = pd.DataFrame(rows_list)
fig = plt.subplots(figsize=(20,8))
ax = sns.lineplot(data=var_vital_signs, x="feature", y="icu", color='r')
sns.lineplot(data=var_vital_signs, x="feature", y="non-icu", color='g')
plt.xticks(rotation=90)
plt.show()
Observation:
print(f"final dataset shape is:{dataset.shape}")
final dataset shape is:(351, 76)
dataset.drop(columns = 'PATIENT_VISIT_IDENTIFIER', inplace=True)
dataset.describe()
| AGE_PERCENTIL | GENDER | ALBUMIN_MEDIAN | BE_ARTERIAL_MEDIAN | BE_VENOUS_MEDIAN | BIC_ARTERIAL_MEDIAN | BIC_VENOUS_MEDIAN | BILLIRUBIN_MEDIAN | BLAST_MEDIAN | CALCIUM_MEDIAN | CREATININ_MEDIAN | FFA_MEDIAN | GGT_MEDIAN | GLUCOSE_MEDIAN | HEMATOCRITE_MEDIAN | HEMOGLOBIN_MEDIAN | INR_MEDIAN | LACTATE_MEDIAN | LEUKOCYTES_MEDIAN | LINFOCITOS_MEDIAN | NEUTROPHILES_MEDIAN | P02_ARTERIAL_MEDIAN | P02_VENOUS_MEDIAN | PC02_ARTERIAL_MEDIAN | PC02_VENOUS_MEDIAN | PCR_MEDIAN | PH_ARTERIAL_MEDIAN | PH_VENOUS_MEDIAN | PLATELETS_MEDIAN | POTASSIUM_MEDIAN | SAT02_ARTERIAL_MEDIAN | SAT02_VENOUS_MEDIAN | SODIUM_MEDIAN | TGO_MEDIAN | TGP_MEDIAN | TTPA_MEDIAN | UREA_MEDIAN | DIMER_MEDIAN | BLOODPRESSURE_DIASTOLIC_MEAN | BLOODPRESSURE_SISTOLIC_MEAN | HEART_RATE_MEAN | RESPIRATORY_RATE_MEAN | TEMPERATURE_MEAN | OXYGEN_SATURATION_MEAN | BLOODPRESSURE_DIASTOLIC_MEDIAN | BLOODPRESSURE_SISTOLIC_MEDIAN | HEART_RATE_MEDIAN | RESPIRATORY_RATE_MEDIAN | TEMPERATURE_MEDIAN | OXYGEN_SATURATION_MEDIAN | BLOODPRESSURE_DIASTOLIC_MIN | BLOODPRESSURE_SISTOLIC_MIN | HEART_RATE_MIN | RESPIRATORY_RATE_MIN | TEMPERATURE_MIN | OXYGEN_SATURATION_MIN | BLOODPRESSURE_DIASTOLIC_MAX | BLOODPRESSURE_SISTOLIC_MAX | HEART_RATE_MAX | RESPIRATORY_RATE_MAX | TEMPERATURE_MAX | OXYGEN_SATURATION_MAX | BLOODPRESSURE_DIASTOLIC_DIFF | BLOODPRESSURE_SISTOLIC_DIFF | HEART_RATE_DIFF | RESPIRATORY_RATE_DIFF | TEMPERATURE_DIFF | OXYGEN_SATURATION_DIFF | BLOODPRESSURE_DIASTOLIC_DIFF_REL | BLOODPRESSURE_SISTOLIC_DIFF_REL | HEART_RATE_DIFF_REL | RESPIRATORY_RATE_DIFF_REL | TEMPERATURE_DIFF_REL | OXYGEN_SATURATION_DIFF_REL | GO_ICU | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 | 351.000000 |
| mean | 52.905983 | 0.381766 | 0.554956 | -0.985557 | -0.949424 | -0.316517 | -0.317374 | -0.944460 | -0.993483 | 0.332771 | -0.893073 | -0.726528 | -0.930316 | -0.856552 | -0.133667 | -0.172548 | -0.940769 | 0.449888 | -0.760812 | -0.736898 | -0.825891 | -0.177290 | -0.686570 | -0.778092 | -0.759099 | -0.834459 | 0.232679 | 0.373277 | -0.474968 | -0.545426 | 0.921804 | 0.313619 | -0.072401 | -0.993738 | -0.985877 | -0.830554 | -0.840145 | -0.952026 | -0.062471 | -0.327484 | -0.249250 | -0.463956 | 0.085013 | 0.739482 | -0.065946 | -0.332360 | -0.250379 | -0.458829 | 0.081629 | 0.743359 | 0.012693 | -0.170462 | -0.227380 | -0.475258 | 0.356068 | 0.834863 | -0.244617 | -0.430086 | -0.298933 | -0.386558 | 0.006104 | 0.802169 | -0.807403 | -0.799338 | -0.804988 | -0.777917 | -0.809298 | -0.910798 | -0.837553 | -0.789700 | -0.857572 | -0.787122 | -0.810324 | -0.910752 | 0.461538 |
| std | 28.816458 | 0.486513 | 0.149646 | 0.072823 | 0.118326 | 0.055182 | 0.096368 | 0.060816 | 0.106031 | 0.078625 | 0.089030 | 0.147544 | 0.131480 | 0.104911 | 0.209338 | 0.222004 | 0.051919 | 0.734231 | 0.114210 | 0.139272 | 0.111417 | 0.090416 | 0.127270 | 0.049589 | 0.075473 | 0.200942 | 0.066200 | 0.098439 | 0.201132 | 0.151616 | 0.113045 | 0.276295 | 0.187040 | 0.017217 | 0.015785 | 0.061872 | 0.104319 | 0.132466 | 0.222431 | 0.258340 | 0.246603 | 0.178930 | 0.213823 | 0.145499 | 0.225319 | 0.258794 | 0.255383 | 0.182155 | 0.216619 | 0.147381 | 0.208583 | 0.220084 | 0.233925 | 0.188489 | 0.149017 | 0.158410 | 0.185468 | 0.208580 | 0.222574 | 0.251018 | 0.200912 | 0.149406 | 0.185209 | 0.194405 | 0.175738 | 0.236197 | 0.153417 | 0.157520 | 0.164183 | 0.199713 | 0.121888 | 0.213955 | 0.151840 | 0.157670 | 0.499230 |
| min | 10.000000 | 0.000000 | -0.263158 | -1.000000 | -1.000000 | -0.756098 | -1.000000 | -0.992674 | -1.000000 | 0.030612 | -0.970276 | -0.927505 | -0.997664 | -0.929236 | -0.903564 | -0.871951 | -0.991217 | -0.975884 | -0.966010 | -0.977178 | -0.990796 | -0.878049 | -0.988166 | -0.958621 | -1.000000 | -1.000000 | -0.489362 | -0.318182 | -0.991989 | -0.962963 | -0.575758 | -0.925926 | -0.714286 | -0.999627 | -0.999619 | -0.961853 | -0.978313 | -1.000000 | -0.654321 | -0.880959 | -0.811321 | -0.932203 | -0.464286 | -1.000000 | -0.679012 | -0.884615 | -0.811321 | -0.931034 | -0.464286 | -1.000000 | -0.587629 | -0.862500 | -0.803419 | -0.964286 | 0.032967 | -0.060606 | -0.760684 | -0.870270 | -0.850746 | -0.939394 | -0.565217 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 0.000000 |
| 25% | 30.000000 | 0.000000 | 0.605263 | -1.000000 | -1.000000 | -0.317073 | -0.317073 | -0.962498 | -1.000000 | 0.318027 | -0.928521 | -0.757996 | -0.958528 | -0.891993 | -0.250524 | -0.299797 | -0.959849 | 0.027331 | -0.834299 | -0.830221 | -0.897359 | -0.170732 | -0.704142 | -0.779310 | -0.760736 | -0.967013 | 0.234043 | 0.363636 | -0.603471 | -0.629630 | 0.939394 | 0.345679 | -0.171429 | -0.996711 | -0.992759 | -0.846633 | -0.896386 | -0.983001 | -0.192842 | -0.501207 | -0.407943 | -0.559322 | -0.077267 | 0.689493 | -0.197531 | -0.504167 | -0.415094 | -0.551724 | -0.075893 | 0.688596 | -0.120275 | -0.314583 | -0.367521 | -0.571429 | 0.252747 | 0.838384 | -0.361823 | -0.567568 | -0.445274 | -0.555556 | -0.130435 | 0.736842 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | 0.000000 |
| 50% | 50.000000 | 0.000000 | 0.605263 | -1.000000 | -1.000000 | -0.317073 | -0.317073 | -0.941218 | -1.000000 | 0.357143 | -0.909413 | -0.742004 | -0.958528 | -0.891993 | -0.132075 | -0.170732 | -0.959849 | 1.000000 | -0.788142 | -0.765560 | -0.851140 | -0.170732 | -0.704142 | -0.779310 | -0.754601 | -0.894140 | 0.234043 | 0.363636 | -0.484646 | -0.555556 | 0.939394 | 0.345679 | -0.028571 | -0.995521 | -0.988377 | -0.846633 | -0.869880 | -0.978029 | -0.061728 | -0.369231 | -0.266799 | -0.506053 | 0.063492 | 0.747829 | -0.061728 | -0.369231 | -0.273585 | -0.505747 | 0.059524 | 0.754386 | 0.024055 | -0.187500 | -0.253561 | -0.500000 | 0.333333 | 0.878788 | -0.247863 | -0.452252 | -0.318408 | -0.484848 | 0.007246 | 0.828947 | -0.826087 | -0.815951 | -0.816794 | -0.852941 | -0.809524 | -0.949495 | -0.857708 | -0.807261 | -0.859410 | -0.847670 | -0.808466 | -0.950579 | 0.000000 |
| 75% | 80.000000 | 1.000000 | 0.605263 | -1.000000 | -0.950262 | -0.317073 | -0.317073 | -0.938950 | -1.000000 | 0.357143 | -0.881104 | -0.742004 | -0.954634 | -0.855680 | -0.002096 | -0.036585 | -0.933501 | 1.000000 | -0.714465 | -0.676349 | -0.784914 | -0.170732 | -0.704142 | -0.779310 | -0.754601 | -0.784121 | 0.234043 | 0.378788 | -0.375834 | -0.481481 | 0.939394 | 0.345679 | 0.057143 | -0.994448 | -0.985137 | -0.830673 | -0.821687 | -0.964518 | 0.077444 | -0.184035 | -0.128039 | -0.413506 | 0.214675 | 0.812427 | 0.086420 | -0.193590 | -0.135220 | -0.419540 | 0.214286 | 0.815789 | 0.134021 | -0.062500 | -0.136752 | -0.428571 | 0.428571 | 0.909091 | -0.128205 | -0.300450 | -0.171642 | -0.237374 | 0.140097 | 0.877193 | -0.721739 | -0.705521 | -0.716285 | -0.647059 | -0.706349 | -0.909091 | -0.773592 | -0.687866 | -0.791729 | -0.664875 | -0.708330 | -0.909404 | 1.000000 |
| max | 100.000000 | 1.000000 | 0.789474 | 0.000000 | 0.000000 | 0.341463 | 0.341463 | 0.030525 | 0.969834 | 0.693878 | 0.232838 | 0.680171 | 1.000000 | 0.452514 | 0.811321 | 0.804878 | -0.355082 | 1.000000 | -0.020471 | 0.080913 | -0.070028 | 0.731707 | 0.372781 | 0.103448 | 0.122699 | 0.535350 | 0.617021 | 0.818182 | 0.345794 | 0.222222 | 1.000000 | 1.000000 | 1.000000 | -0.693944 | -0.794970 | -0.341573 | 0.050602 | 1.000000 | 1.000000 | 1.000000 | 0.905660 | 0.830508 | 0.964286 | 1.000000 | 1.000000 | 1.000000 | 0.962264 | 0.862069 | 0.964286 | 1.000000 | 1.000000 | 1.000000 | 0.914530 | 1.000000 | 0.978022 | 1.000000 | 0.384615 | 0.405405 | 0.552239 | 0.636364 | 0.681159 | 1.000000 | 0.130435 | 0.411043 | 0.160305 | 0.058824 | -0.095238 | -0.010101 | 0.130435 | 0.378657 | -0.343264 | -0.054337 | -0.124649 | -0.000219 | 1.000000 |
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.04) # Setting threshold of 4% low variance features
selector.fit(dataset.drop(['GO_ICU'], axis=1)) # Fit to train without target variables
f = np.vectorize(lambda x : not x) # Function to toggle boolean array elements
low_variance = dataset.drop(['GO_ICU'], axis=1).columns[f(selector.get_support())]
print('{} variables have too low variance.'.format(len(low_variance)))
print('These variables are {}'.format(list(low_variance)))
49 variables have too low variance. These variables are ['ALBUMIN_MEDIAN', 'BE_ARTERIAL_MEDIAN', 'BE_VENOUS_MEDIAN', 'BIC_ARTERIAL_MEDIAN', 'BIC_VENOUS_MEDIAN', 'BILLIRUBIN_MEDIAN', 'BLAST_MEDIAN', 'CALCIUM_MEDIAN', 'CREATININ_MEDIAN', 'FFA_MEDIAN', 'GGT_MEDIAN', 'GLUCOSE_MEDIAN', 'INR_MEDIAN', 'LEUKOCYTES_MEDIAN', 'LINFOCITOS_MEDIAN', 'NEUTROPHILES_MEDIAN', 'P02_ARTERIAL_MEDIAN', 'P02_VENOUS_MEDIAN', 'PC02_ARTERIAL_MEDIAN', 'PC02_VENOUS_MEDIAN', 'PH_ARTERIAL_MEDIAN', 'PH_VENOUS_MEDIAN', 'POTASSIUM_MEDIAN', 'SAT02_ARTERIAL_MEDIAN', 'SODIUM_MEDIAN', 'TGO_MEDIAN', 'TGP_MEDIAN', 'TTPA_MEDIAN', 'UREA_MEDIAN', 'DIMER_MEDIAN', 'RESPIRATORY_RATE_MEAN', 'OXYGEN_SATURATION_MEAN', 'RESPIRATORY_RATE_MEDIAN', 'OXYGEN_SATURATION_MEDIAN', 'RESPIRATORY_RATE_MIN', 'TEMPERATURE_MIN', 'OXYGEN_SATURATION_MIN', 'BLOODPRESSURE_DIASTOLIC_MAX', 'OXYGEN_SATURATION_MAX', 'BLOODPRESSURE_DIASTOLIC_DIFF', 'BLOODPRESSURE_SISTOLIC_DIFF', 'HEART_RATE_DIFF', 'TEMPERATURE_DIFF', 'OXYGEN_SATURATION_DIFF', 'BLOODPRESSURE_DIASTOLIC_DIFF_REL', 'BLOODPRESSURE_SISTOLIC_DIFF_REL', 'HEART_RATE_DIFF_REL', 'TEMPERATURE_DIFF_REL', 'OXYGEN_SATURATION_DIFF_REL']
dataset.drop(columns = low_variance, inplace=True)
dataset.shape
(351, 26)
Observation:
dataset.columns
Index(['AGE_PERCENTIL', 'GENDER', 'HEMATOCRITE_MEDIAN', 'HEMOGLOBIN_MEDIAN',
'LACTATE_MEDIAN', 'PCR_MEDIAN', 'PLATELETS_MEDIAN',
'SAT02_VENOUS_MEDIAN', 'BLOODPRESSURE_DIASTOLIC_MEAN',
'BLOODPRESSURE_SISTOLIC_MEAN', 'HEART_RATE_MEAN', 'TEMPERATURE_MEAN',
'BLOODPRESSURE_DIASTOLIC_MEDIAN', 'BLOODPRESSURE_SISTOLIC_MEDIAN',
'HEART_RATE_MEDIAN', 'TEMPERATURE_MEDIAN',
'BLOODPRESSURE_DIASTOLIC_MIN', 'BLOODPRESSURE_SISTOLIC_MIN',
'HEART_RATE_MIN', 'BLOODPRESSURE_SISTOLIC_MAX', 'HEART_RATE_MAX',
'RESPIRATORY_RATE_MAX', 'TEMPERATURE_MAX', 'RESPIRATORY_RATE_DIFF',
'RESPIRATORY_RATE_DIFF_REL', 'GO_ICU'],
dtype='object')
# Features most important for prediction
corr_features = dataset.corr()['GO_ICU'].sort_values(ascending=False).head(20).index[1:]
corr_features
Index(['RESPIRATORY_RATE_MAX', 'RESPIRATORY_RATE_DIFF',
'RESPIRATORY_RATE_DIFF_REL', 'AGE_PERCENTIL', 'PCR_MEDIAN',
'BLOODPRESSURE_SISTOLIC_MAX', 'BLOODPRESSURE_SISTOLIC_MEAN',
'TEMPERATURE_MAX', 'BLOODPRESSURE_SISTOLIC_MEDIAN',
'TEMPERATURE_MEDIAN', 'TEMPERATURE_MEAN', 'HEART_RATE_MAX',
'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MEAN', 'HEART_RATE_MEDIAN',
'HEART_RATE_MIN', 'PLATELETS_MEDIAN', 'SAT02_VENOUS_MEDIAN', 'GENDER'],
dtype='object')
sample = dataset
var = corr_features
sample = sample[var]
corr_heatmap(sample)
Obsrevation:
corr_features
Index(['RESPIRATORY_RATE_MAX', 'RESPIRATORY_RATE_DIFF',
'RESPIRATORY_RATE_DIFF_REL', 'AGE_PERCENTIL', 'PCR_MEDIAN',
'BLOODPRESSURE_SISTOLIC_MAX', 'BLOODPRESSURE_SISTOLIC_MEAN',
'TEMPERATURE_MAX', 'BLOODPRESSURE_SISTOLIC_MEDIAN',
'TEMPERATURE_MEDIAN', 'TEMPERATURE_MEAN', 'HEART_RATE_MAX',
'BLOODPRESSURE_SISTOLIC_MIN', 'HEART_RATE_MEAN', 'HEART_RATE_MEDIAN',
'HEART_RATE_MIN', 'PLATELETS_MEDIAN', 'SAT02_VENOUS_MEDIAN', 'GENDER'],
dtype='object')
X = dataset[corr_features]
y = dataset["GO_ICU"]
Strategy: After some testing, it turned out that respiratory rate and its variations, are quite important for model because of it's correlation with the target, so I have assumend that keeping correlated features with each other will still help the model with learning. I have decided to keep top 10 most correlated features with the target even if they are correlated with different features, if I will found feature which is correlated with different feature and it is outside top 10, then I will drop it.
# HEAD(20) - 85.5% - Random Forest
# col = ['RESPIRATORY_RATE_DIFF', 'RESPIRATORY_RATE_DIFF_REL', 'BLOODPRESSURE_SISTOLIC_MEAN', 'BLOODPRESSURE_SISTOLIC_MEDIAN', "TEMPERATURE_MEAN", "HEART_RATE_MEAN", "HEART_RATE_MEDIAN", "HEART_RATE_MIN","BLOODPRESSURE_SISTOLIC_MIN"]
# X.drop(columns = col, inplace=True)
Observation to the code commented out above:
# HEAD(20) - 93.4% - Random Forest
col = ["TEMPERATURE_MEAN", "HEART_RATE_MEAN", "HEART_RATE_MEDIAN", "HEART_RATE_MIN","BLOODPRESSURE_SISTOLIC_MIN"]
X.drop(columns = col, inplace=True)
/var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/2721181433.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
sample = X
corr_heatmap(sample)
X.columns
Index(['RESPIRATORY_RATE_MAX', 'RESPIRATORY_RATE_DIFF',
'RESPIRATORY_RATE_DIFF_REL', 'AGE_PERCENTIL', 'PCR_MEDIAN',
'BLOODPRESSURE_SISTOLIC_MAX', 'BLOODPRESSURE_SISTOLIC_MEAN',
'TEMPERATURE_MAX', 'BLOODPRESSURE_SISTOLIC_MEDIAN',
'TEMPERATURE_MEDIAN', 'HEART_RATE_MAX', 'PLATELETS_MEDIAN',
'SAT02_VENOUS_MEDIAN', 'GENDER'],
dtype='object')
X.shape
(351, 14)
I have chosen oversampling since we do not have many samples
ax = sns.countplot(x="GO_ICU", data=dataset)
for i in ax.containers:
ax.bar_label(i,)
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X, y = ros.fit_resample(X, y)
ax = sns.countplot(x=y, data=dataset)
for i in ax.containers:
ax.bar_label(i,)
norm = MinMaxScaler().fit(X)
X_norm = norm.transform(X)
X_train,X_test,y_train,y_test = train_test_split(X_norm,y,test_size=0.2,random_state=42)
clfs = {"LogisticRegression":LogisticRegression(),
"SVM":SVC(kernel='rbf', probability=True),
"Decision":DecisionTreeClassifier(),
"RandomForest":RandomForestClassifier(),
"GradientBoost":GradientBoostingClassifier(),
"XGBoost":XGBClassifier(verbosity=0),
"CatBoost":CatBoostClassifier(verbose=False)}
def model_fit(clfs):
fitted_model={}
model_result = pd.DataFrame()
for model_name, model in clfs.items():
model.fit(X_train,y_train)
fitted_model.update({model_name:model})
y_pred = model.predict(X_test)
model_dict = {}
model_dict['1.Algorithm'] = model_name
model_dict['2.Accuracy'] = round(accuracy_score(y_test, y_pred),3)
model_dict['3.Precision'] = round(precision_score(y_test, y_pred),3)
model_dict['4.Recall'] = round(recall_score(y_test, y_pred),3)
model_dict['5.F1'] = round(f1_score(y_test, y_pred),3)
model_dict['6.ROC'] = round(roc_auc_score(y_test, y_pred),3)
model_result = model_result.append(model_dict,ignore_index=True)
return fitted_model, model_result
fitted_model, model_result = model_fit(clfs)
/var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/111864206.py:15: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/111864206.py:15: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/111864206.py:15: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/111864206.py:15: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/111864206.py:15: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/111864206.py:15: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /var/folders/94/vt0_m_ds4ldfdtdt98l0j_cr0000gn/T/ipykernel_83979/111864206.py:15: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
model_result.sort_values(by=['2.Accuracy'],ascending=False)
| 1.Algorithm | 2.Accuracy | 3.Precision | 4.Recall | 5.F1 | 6.ROC | |
|---|---|---|---|---|---|---|
| 3 | RandomForest | 0.934 | 0.921 | 0.946 | 0.933 | 0.935 |
| 6 | CatBoost | 0.921 | 0.919 | 0.919 | 0.919 | 0.921 |
| 4 | GradientBoost | 0.908 | 0.895 | 0.919 | 0.907 | 0.908 |
| 5 | XGBoost | 0.908 | 0.917 | 0.892 | 0.904 | 0.907 |
| 2 | Decision | 0.882 | 0.868 | 0.892 | 0.880 | 0.882 |
| 0 | LogisticRegression | 0.816 | 0.829 | 0.784 | 0.806 | 0.815 |
| 1 | SVM | 0.803 | 0.867 | 0.703 | 0.776 | 0.800 |
Observation:
model_result["1.Algorithm"][2:]
2 Decision 3 RandomForest 4 GradientBoost 5 XGBoost 6 CatBoost Name: 1.Algorithm, dtype: object
model_name = model_result["1.Algorithm"][2:]
for model in model_name:
feat_imp = pd.Series(fitted_model.get(model).feature_importances_, index=X.columns)
feat_imp = feat_imp.nlargest(15).sort_values()
feat_imp.plot(kind="barh", title="Feature Importance ({:})".format(model))
plt.show()
Observations: